From 154581bf25ee13bd79efe54d0870a416d79c9e55 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Tue, 11 Jun 2024 16:20:08 -0700
Subject: [PATCH 01/17] [matmul] work around object fifo bug; accumulate in
 float for verification; probabilistically verify beyond threshold size; add
 assertions and comments; improve verification output; add workaround comment
 for excessive program size

---
 .../basic/matrix_multiplication/common.h      | 47 +++++++++++-----
 .../basic/matrix_multiplication/test.cpp      | 54 ++++++++++++++----
 .../matrix_multiplication/whole_array/aie2.py | 55 +++++++++++++++----
 3 files changed, 122 insertions(+), 34 deletions(-)
diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h
index b7386f268c..851eb8ded7 100644
--- a/programming_examples/basic/matrix_multiplication/common.h
+++ b/programming_examples/basic/matrix_multiplication/common.h
@@ -130,7 +130,7 @@ void matmul_naive(int M, int N, int K, const std::vector<Tin> A,
   }
 }
 
-template <typename Tin, typename Tout>
+template <typename Tin, typename Tout, typename Tacc>
 void matmul(int M, int N, int K, const std::vector<Tin> A,
             const std::vector<Tin> B, std::vector<Tout> &C) {
   // A is an  MxK matrix
@@ -159,10 +159,10 @@ void matmul(int M, int N, int K, const std::vector<Tin> A,
     for (int col = 0; col < N; col++) {
       A_ptr = A_base;
       B_ptr = B_base;
-      Tout running_sum = 0;
+      Tacc running_sum = 0;
       for (int k = 0; k < n_K_blocks; k++) {
         for (int i = 0; i < K_block_size; i++) {
-          running_sum += Tout(*A_ptr) * Tout(*B_ptr);
+          running_sum += Tacc(*A_ptr * *B_ptr);
           A_ptr += 1; // Advance to right neighbor; next value in this row
           B_ptr += N; // Advance to bottom neighbor; next value in this column
         }
@@ -178,14 +178,14 @@ void matmul(int M, int N, int K, const std::vector<Tin> A,
   }
 }
 
-template <typename Tin, typename Tout>
+template <typename Tin, typename Tout, typename Tacc>
 Tout mul_acc(int M, int N, int K, int row, int col, const std::vector<Tin> A,
              const std::vector<Tin> B) {
-  Tout running_sum = 0;
+  Tacc running_sum = 0;
   for (int k = 0; k < K; k++) {
-    running_sum += Tout(A[row * K + k] * B[k * N + col]);
+    running_sum += Tacc(A[row * K + k] * B[k * N + col]);
   }
-  return running_sum;
+  return (Tout)running_sum;
 }
 
 // nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0
@@ -291,7 +291,8 @@ verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual) {
 
 template <typename Tout>
 void print_error_summary(std::ostream &os, int n_errors,
-                         std::vector<struct error<Tout>> &errors) {
+                         std::vector<struct error<Tout>> &errors,
+                         Tout max_rel_error) {
   for (struct error<Tout> &err : errors) {
     os << "[" << std::setw(5) << err.row << ", " << std::setw(5) << err.col
        << "] " << std::setw(4) << std::setprecision(2) << std::fixed
@@ -302,6 +303,10 @@ void print_error_summary(std::ostream &os, int n_errors,
     os << "...and " << std::setw(0) << n_errors - max_printable_errors
        << " further errors." << std::endl;
   }
+  if (n_errors > 0) {
+    os << "Maximum relative error: " << std::setw(3) << std::setprecision(0)
+       << max_rel_error * 100 << "%" << std::endl;
+  }
 }
 
 void print_progress_bar(std::ostream &os, double progress, int len = 75) {
@@ -311,14 +316,15 @@ void print_progress_bar(std::ostream &os, double progress, int len = 75) {
      << "\r";
 }
 
-template <typename Tin, typename Tout>
+template <typename Tin, typename Tout, typename Tacc>
 int verify(int M, int N, int K, std::vector<Tin> A, std::vector<Tin> B,
            std::vector<Tout> C, int verbosity = 0) {
   int n_errors = 0;
   std::vector<struct error<Tout>> errors;
+  Tout max_rel_error = (Tout)0.0f;
 
   std::vector<Tout> CRef(M * N);
-  matmul(M, N, K, A, B, CRef);
+  matmul<Tin, Tout, Tacc>(M, N, K, A, B, CRef);
 
   for (int row = 0; row < M; row++) {
     for (int col = 0; col < N; col++) {
@@ -328,11 +334,17 @@ int verify(int M, int N, int K, std::vector<Tin> A, std::vector<Tin> B,
         if (n_errors < max_printable_errors) {
           errors.push_back(*error);
         }
+        Tout rel_error =
+            std::abs(error->actual - error->expected) /
+            std::max(std::abs(error->actual), std::abs(error->expected));
+        if (rel_error > max_rel_error) {
+          max_rel_error = rel_error;
+        }
         n_errors++;
       }
     }
   }
-  print_error_summary(std::cout, n_errors, errors);
+  print_error_summary(std::cout, n_errors, errors, max_rel_error);
 
   if (n_errors > 0) {
     std::cout << std::endl << "Reference:" << std::endl;
@@ -344,7 +356,7 @@ int verify(int M, int N, int K, std::vector<Tin> A, std::vector<Tin> B,
   return n_errors;
 }
 
-template <typename Tin, typename Tout>
+template <typename Tin, typename Tout, typename Tacc>
 int verify_stochastic(int M, int N, int K, std::vector<Tin> A,
                       std::vector<Tin> B, std::vector<Tout> C, int n_samples,
                       int verbosity = 0) {
@@ -359,6 +371,7 @@ int verify_stochastic(int M, int N, int K, std::vector<Tin> A,
 
   int n_errors = 0;
   std::vector<struct error<Tout>> errors;
+  Tout max_rel_error = (Tout)0.0f;
   double progress = 0;
   for (std::tuple<size_t, std::tuple<int &, int &>> cell :
        std::views::enumerate(std::views::zip(sampled_rows, sampled_cols))) {
@@ -371,19 +384,25 @@ int verify_stochastic(int M, int N, int K, std::vector<Tin> A,
       progress = (double)i / n_samples;
       print_progress_bar(std::cerr, progress);
     }
-    Tout ref = mul_acc<Tin, Tout>(M, N, K, row, col, A, B);
+    Tout ref = mul_acc<Tin, Tout, Tacc>(M, N, K, row, col, A, B);
     std::optional<struct error<Tout>> error =
         verify_single(std::cout, row, col, ref, C[row * N + col]);
     if (error.has_value()) {
       if (n_errors < max_printable_errors) {
         errors.push_back(*error);
       }
+      Tout rel_error =
+          std::abs(error->actual - error->expected) /
+          std::max(std::abs(error->actual), std::abs(error->expected));
+      if (rel_error > max_rel_error) {
+        max_rel_error = rel_error;
+      }
       n_errors++;
     }
   }
   std::cout << std::endl;
 
-  print_error_summary(std::cout, n_errors, errors);
+  print_error_summary(std::cout, n_errors, errors, max_rel_error);
   return n_errors;
 }
 
diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp
index fded6f0de7..e3786b0d2a 100644
--- a/programming_examples/basic/matrix_multiplication/test.cpp
+++ b/programming_examples/basic/matrix_multiplication/test.cpp
@@ -31,8 +31,12 @@
 using A_DATATYPE = std::bfloat16_t;
 using B_DATATYPE = std::bfloat16_t;
 using C_DATATYPE = std::bfloat16_t;
+using ACC_DATATYPE = float;
 #endif
 
+constexpr long long verify_stochastic_threshold = 1024 * 1024 * 1024;
+constexpr int verify_stochastic_n_samples = 1000;
+
 namespace po = boost::program_options;
 
 int main(int argc, const char *argv[]) {
@@ -54,6 +58,8 @@ int main(int argc, const char *argv[]) {
   int M = vm["M"].as<int>();
   int K = vm["K"].as<int>();
   int N = vm["N"].as<int>();
+  bool do_verify_stochastic =
+      (long long)M * N * K > verify_stochastic_threshold;
 
   if (verbosity >= 1) {
     std::cout << "Matrix size " << M << "x" << K << "x" << N << std::endl;
@@ -140,17 +146,26 @@ int main(int argc, const char *argv[]) {
   std::vector<B_DATATYPE> BVec(B_VOLUME);
   for (int i = 0; i < B_VOLUME; i++) {
     BVec[i] = matmul_common::random_bfloat16_t();
+    // Diagonal:
+    // if(i % N == i / N) {
+    //   BVec[i] = 1.0;
+    // } else {
+    //   BVec[i] = 0.0;
+    // }
   }
   memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE)));
 
   // Initialize outputs; bufOut is results matrix plus tracing info
   char *bufOut = bo_out.map<char *>();
   std::vector<C_DATATYPE> CVec(C_VOLUME);
-  // memcpy(bufOut, CVec.data(), (CVec.size() * sizeof(C_DATATYPE)));
   memset(bufOut, 0, OUT_SIZE);
-  // if(trace_size > 0) {
-  //   memset(bufOut + C_SIZE, 0, trace_size);
-  // }
+
+  if (verbosity >= 2) {
+    std::cout << "A = \n";
+    matmul_common::print_matrix(AVec, K);
+    std::cout << "B = \n";
+    matmul_common::print_matrix(BVec, N);
+  }
 
   // Instruction buffer for DMA configuration
   void *bufInstr = bo_instr.map<void *>();
@@ -172,14 +187,14 @@ int main(int argc, const char *argv[]) {
   for (unsigned iter = 0; iter < num_iter; iter++) {
 
     if (verbosity >= 1) {
-      std::cout << "Running Kernel.\n";
+      std::cout << "Running Kernel (iteration " << iter << ").\n";
     }
     auto start = std::chrono::high_resolution_clock::now();
     unsigned int opcode = 3;
     auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_out);
     ert_cmd_state r = run.wait();
     if (r != ERT_CMD_STATE_COMPLETED) {
-      std::cout << "kernel did not complete. returned status: " << r << "\n";
+      std::cout << "Kernel did not complete. Returned status: " << r << "\n";
       return 1;
     }
     auto stop = std::chrono::high_resolution_clock::now();
@@ -193,16 +208,29 @@ int main(int argc, const char *argv[]) {
     memcpy(CVec.data(), bufOut, (CVec.size() * sizeof(C_DATATYPE)));
     if (do_verify) {
       if (verbosity >= 1) {
-        std::cout << "Verifying against reference matmul ..." << std::endl;
+        if (do_verify_stochastic) {
+          std::cout << "Verifying " << verify_stochastic_n_samples
+                    << " random samples against reference matmul ..."
+                    << std::endl;
+        } else {
+          std::cout << "Verifying against reference matmul ..." << std::endl;
+        }
       }
       auto vstart = std::chrono::system_clock::now();
-      errors = matmul_common::verify(M, N, K, AVec, BVec, CVec);
+      if (do_verify_stochastic) {
+        errors = matmul_common::verify_stochastic<A_DATATYPE, C_DATATYPE,
+                                                  ACC_DATATYPE>(
+            M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity);
+      } else {
+        errors = matmul_common::verify<A_DATATYPE, C_DATATYPE, ACC_DATATYPE>(
+            M, N, K, AVec, BVec, CVec);
+      }
       auto vstop = std::chrono::system_clock::now();
       float vtime =
           std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
               .count();
       if (verbosity >= 1) {
-        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+        std::cout << "Verify time: " << vtime << " s." << std::endl;
       }
     } else {
       if (verbosity >= 1)
@@ -241,7 +269,13 @@ int main(int argc, const char *argv[]) {
     std::cout << "\nPASS!\n\n";
     return 0;
   } else {
-    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nError count: " << errors;
+    if (do_verify_stochastic) {
+      std::cout << " (out of " << verify_stochastic_n_samples
+                << " random samples)";
+    }
+    std::cout << "\n\n";
+
     std::cout << "\nFailed.\n\n";
     return 1;
   }
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
index 76453f4b94..139d14fd12 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -34,13 +34,44 @@ def my_matmul(M=512, K=512, N=512):
     r = 4
     s = 8
     t = 4
-    word_size_in = 2
-    word_size_out = 2
 
     n_rows = 4
     n_cols = 4
     n_cores = n_rows * n_cols
 
+    # Input matrix A:
+    # Conceptually, we divide input A into (m * n_rows, k)-sized blocks. These
+    # blocks are _broadcast_ across AIE core columns, then _distributed_ across
+    # rows, s.t. each of the n_rows compute cores in a column receives a
+    # contiguous (m, k)-sized block of A.
+    assert (
+        M % (m * n_rows) == 0
+    ), """A must be tileable into (m * n_rows, k)-sized blocks"""
+
+    # Both A and B are tiled in the K dimension into size k.
+    assert K % k == 0
+
+    # Input matrix B:
+    # Conceptually, we do the same as with A, but instead of broadcasting
+    # across columns we broadcast across rows and distribute across columns.
+    assert (
+        N % (n * n_cols) == 0
+    ), """B must be tileable into (k, n * n_cols)-sized blocks"""
+
+    # r, s, t are the dimensions required by the microkernel MAC instructions.
+    assert m % r == 0
+    assert k % s == 0
+    assert n % t == 0
+
+    word_size_in = 2
+    word_size_out = 2
+
+    # If you get errors during CDO generation due to running out of program
+    # memory, it may be because too much code is generated due to ObjectFIFO
+    # loop unrollings. Reducing the depth to 1 here will work around that at
+    # a big performance cost.
+    fifo_depth = 2
+
     A_sz_in_i32s = M * K * word_size_in // 4
     B_sz_in_i32s = K * N * word_size_in // 4
     C_sz_in_bytes = M * N * word_size_out
@@ -187,14 +218,14 @@ def device_body():
                     inA_fifo_names[i],
                     shims[i],
                     mems[i],
-                    2,
+                    fifo_depth,
                     memRef_inA_ty,
                 )
                 memA_fifos[memA_fifo_names[i]] = object_fifo(
                     memA_fifo_names[i],
                     mems[i],
                     t_cores[i][0:n_cols],
-                    2,
+                    fifo_depth,
                     memRef_A_ty,
                     [
                         (m // r, r * k),
@@ -211,14 +242,14 @@ def device_body():
                     inB_fifo_names[i],
                     shims[i],
                     mems[i],
-                    2,
+                    fifo_depth,
                     memRef_inB_ty,
                 )
                 memB_fifos[memB_fifo_names[i]] = object_fifo(
                     memB_fifo_names[i],
                     mems[i],
                     cores[i][0:n_rows],
-                    2,
+                    fifo_depth,
                     memRef_B_ty,
                     [
                         (k // s, s * n),
@@ -236,14 +267,14 @@ def device_body():
                         memC_fifo_names[i][j],
                         cores[i][j],
                         mems[i],
-                        2,
+                        fifo_depth,
                         memRef_C_ty,
                     )
                 outC_fifos[outC_fifo_names[i]] = object_fifo(
                     outC_fifo_names[i],
                     mems[i],
                     shims[i],
-                    2,
+                    fifo_depth,
                     memRef_outC_ty,
                     [
                         (m // r, r * n),
@@ -261,7 +292,9 @@ def device_body():
                     @core(cores[j][i], "mm.o")
                     def core_body():
                         for _ in for_(0xFFFFFFFF):
-                            for _ in for_(tiles):
+                            for _ in (
+                                for_(tiles) if tiles > 1 else range(1)
+                            ):  # Workaround for issue #1547
                                 elem_out = memC_fifos[j][memC_fifo_names[j][i]].acquire(
                                     ObjectFifoPort.Produce,
                                     1,
@@ -290,7 +323,9 @@ def core_body():
                                     ObjectFifoPort.Produce, 1
                                 )
                                 yield_([])
-                            yield_([])
+
+                            if tiles > 1:  # workaround for issue #1547
+                                yield_([])
 
             # To/from AIE-array data movement
 

From 08e404a3b11f1a5dab67950a1f256915181a5c1d Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 12:51:02 -0700
Subject: [PATCH 02/17] [matmul] allow modifiable tile size for whole_array

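Make the inner tile dimensions m, k, n configurable for the whole_array
design: mm.cc takes them from the DIM_M/DIM_K/DIM_N macros (default 64),
the whole_array Makefile builds a matching mm_<m>x<k>x<n>.o kernel object,
and aie2.py accepts them as -m/-k/-n arguments.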
---
 aie_kernels/aie2/mm.cc                        | 37 ++++++++++++++-----
 .../matrix_multiplication/makefile-common     | 15 ++++----
 .../whole_array/Makefile                      | 13 ++++++-
 .../matrix_multiplication/whole_array/aie2.py | 14 +++----
 4 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/aie_kernels/aie2/mm.cc b/aie_kernels/aie2/mm.cc
index 8b7732fdaf..35437afb40 100755
--- a/aie_kernels/aie2/mm.cc
+++ b/aie_kernels/aie2/mm.cc
@@ -24,16 +24,16 @@
 
 #include "zero.cc"
 
-template <typename T_in, typename T_out, int M, int K, int N>
+template <typename T_in, typename T_out, int rowA, int colA, int colB>
 void matmul_scalar(T_in *a, T_in *b, T_out *c) {
   event0();
-  for (int row = 0; row < M; row++) {
-    for (int col = 0; col < N; col++) {
+  for (int row = 0; row < rowA; row++) {
+    for (int col = 0; col < colB; col++) {
       T_out running_sum = 0;
-      for (int i = 0; i < K; i++) {
-        running_sum += a[row * K + i] * b[i * N + col];
+      for (int i = 0; i < colA; i++) {
+        running_sum += a[row * colA + i] * b[i * colB + col];
       }
-      c[row * N + col] += running_sum;
+      c[row * colB + col] += running_sum;
     }
   }
   event1();
@@ -397,6 +397,23 @@ void matmul_vectorized_4x8x4_bf16_f32(const bfloat16 *__restrict pA,
 
 extern "C" {
 
+// If you want to compile microkernels with different inner tile sizes,
+// define DIM_M, DIM_K and DIM_N at compile time using -DDIM_M=32 etc.
+// These dimensions must be divisible by the r, s, t dimensions used in
+// the kernels.
+
+#ifndef DIM_M
+#define DIM_M 64
+#endif
+
+#ifndef DIM_K
+#define DIM_K 64
+#endif
+
+#ifndef DIM_N
+#define DIM_N 64
+#endif
+
 #define combos(X)                                                              \
   X(int16, i16, int16, i16, 4, 4, 4)                                           \
   X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4)                                   \
@@ -407,26 +424,26 @@ extern "C" {
   void matmul_##mlir_type_in##_##mlir_type_out(ctype_in *a_in, ctype_in *b_in, \
                                                ctype_out *c_out) {             \
     matmul_vectorized_##r##x##s##x##t##_##mlir_type_in##_##mlir_type_out<      \
-        64, 64, 64>(a_in, b_in, c_out);                                        \
+        DIM_M, DIM_K, DIM_N>(a_in, b_in, c_out);                               \
   }
 
 #define matmul_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, \
                              r, s, t)                                          \
   void matmul_scalar_##mlir_type_in##_##mlir_type_out(                         \
       ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) {                      \
-    matmul_scalar<ctype_in, ctype_out, 64, 32, 64>(a_in, b_in, c_out);         \
+    matmul_scalar<ctype_in, ctype_out, DIM_M, DIM_K, DIM_N>(a_in, b_in, c_out);\
   }
 
 #define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,              \
                                mlir_type_out, r, s, t)                         \
   void zero_##mlir_type_out(ctype_out *c_out) {                                \
-    zero_vectorized<ctype_out, 64, 64, 32>(c_out);                             \
+    zero_vectorized<ctype_out, DIM_M, DIM_N, 32>(c_out);                       \
   }
 
 #define zero_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out,   \
                            r, s, t)                                            \
   void zero_scalar_##mlir_type_out(ctype_out *c_out) {                         \
-    zero_scalar<ctype_out, 64, 64>(c_out);                                     \
+    zero_scalar<ctype_out, DIM_M, DIM_N>(c_out);                               \
   }
 
 combos(matmul_vectorized_c_func) combos(matmul_scalar_c_func)
diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common
index e92f4e699c..bd1eb11409 100644
--- a/programming_examples/basic/matrix_multiplication/makefile-common
+++ b/programming_examples/basic/matrix_multiplication/makefile-common
@@ -28,10 +28,7 @@
 #					  N=1 for matrix-vector
 
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
-#include ${CURDIR}/../../makefile-common
 current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
-#include ${current_dir}../../makefile-common
-SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
 include ${current_dir}../../makefile-common
 
 # defaults; overwrite if needed
@@ -39,13 +36,15 @@ M?=512
 K?=512
 N?=512
 
-trace_size=65536
+trace_size?=65536
 
-mlir_target?=build/aie_${M}x${K}x${N}.mlir
-xclbin_target?=build/final_${M}x${K}x${N}.xclbin
-insts_target?=build/insts_${M}x${K}x${N}.txt
+target_suffix?=${M}x${K}x${N}
+mlir_target?=build/aie_${target_suffix}.mlir
+xclbin_target?=build/final_${target_suffix}.xclbin
+insts_target?=build/insts_${target_suffix}.txt
 
 runargs?=-v 1 --warmup 1 --iters 1
+aieargs+=-M $M -K $K -N $N
 
 kernels_dir=${srcdir}/../../../../aie_kernels/aie2
 
@@ -58,7 +57,7 @@ build/%.o: ${kernels_dir}/%.cc
 
 ${mlir_target}: ${srcdir}/aie2.py
 	mkdir -p ${@D}
-	python3 $< -M $M -K $K -N $N > $@
+	python3 $< ${aieargs} > $@
 
 ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o}
 	mkdir -p ${@D}
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/Makefile b/programming_examples/basic/matrix_multiplication/whole_array/Makefile
index 2289d762c6..617c76e975 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/Makefile
+++ b/programming_examples/basic/matrix_multiplication/whole_array/Makefile
@@ -6,13 +6,22 @@
 # 
 ##===----------------------------------------------------------------------===##
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
-
 subdir=whole_array
 targetname=matrixMultiplication
-kernels=mm
 
 M?=512
 K?=512
 N?=512
+m?=64
+k?=64
+n?=64
+
+kernels=mm_${m}x${k}x${n}
+aieargs+=-m $m -k $k -n $n
+target_suffix=${M}x${K}x${N}_${m}x${k}x${n}
 
 include ${srcdir}/../makefile-common
+
+build/mm_${m}x${k}x${n}.o: ${kernels_dir}/mm.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -DDIM_M=${m} -DDIM_K=${k} -DDIM_N=${n} -c $< -o ${@F}
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
index 139d14fd12..022e3b975f 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -23,14 +23,14 @@ def main():
     argparser.add_argument("-M", type=int, default=512)
     argparser.add_argument("-K", type=int, default=512)
     argparser.add_argument("-N", type=int, default=512)
+    argparser.add_argument("-m", type=int, default=64)
+    argparser.add_argument("-k", type=int, default=64)
+    argparser.add_argument("-n", type=int, default=64)
     args = argparser.parse_args()
-    my_matmul(args.M, args.K, args.N)
+    my_matmul(args.M, args.K, args.N, args.m, args.k, args.n)
 
 
-def my_matmul(M=512, K=512, N=512):
-    m = 64
-    k = 64
-    n = 64
+def my_matmul(M, K, N, m, k, n):
     r = 4
     s = 8
     t = 4
@@ -70,7 +70,7 @@ def my_matmul(M=512, K=512, N=512):
     # memory, it may be because too much code is generated due to ObjectFIFO
     # loop unrollings. Reducing the depth to 1 here will work around that at
     # a big performance cost.
-    fifo_depth = 2
+    fifo_depth = 1
 
     A_sz_in_i32s = M * K * word_size_in // 4
     B_sz_in_i32s = K * N * word_size_in // 4
@@ -289,7 +289,7 @@ def device_body():
             for j in range(n_cols):
                 for i in range(n_rows):
                     # Compute tile i
-                    @core(cores[j][i], "mm.o")
+                    @core(cores[j][i], f"mm_{m}x{k}x{n}.o")
                     def core_body():
                         for _ in for_(0xFFFFFFFF):
                             for _ in (

From 74c59e2a1e88f24128cb6d87e434880c6967e9d6 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 12:51:26 -0700
Subject: [PATCH 03/17] [matmul] simplify verification

---
 .../basic/matrix_multiplication/common.h      | 55 ++-----------------
 1 file changed, 4 insertions(+), 51 deletions(-)

diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h
index 851eb8ded7..f045835698 100644
--- a/programming_examples/basic/matrix_multiplication/common.h
+++ b/programming_examples/basic/matrix_multiplication/common.h
@@ -116,68 +116,21 @@ static inline std::bfloat16_t random_bfloat16_t() {
   return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX));
 }
 
-template <typename Tin, typename Tout>
-void matmul_naive(int M, int N, int K, const std::vector<Tin> A,
-                  const std::vector<Tin> B, std::vector<Tout> &C) {
-  for (int row = 0; row < M; row++) {
-    for (int col = 0; col < N; col++) {
-      Tout running_sum = 0;
-      for (int k = 0; k < K; k++) {
-        running_sum += Tout(A[row * K + k] * B[k * N + col]);
-      }
-      C[row * N + col] = Tout(running_sum);
-    }
-  }
-}
-
 template <typename Tin, typename Tout, typename Tacc>
 void matmul(int M, int N, int K, const std::vector<Tin> A,
             const std::vector<Tin> B, std::vector<Tout> &C) {
-  // A is an  MxK matrix
-  // B is a   KxN matrix
-  // C is the MxN output matrix, assumed to be zeroed out
-
-  constexpr int K_block_size = 64;
-  const int n_K_blocks = K / K_block_size;
-
-  const Tin *B_origin = B.data(); /* Avoid a calls to B.data() within the loop
-                                     with this const variable. B does not get
-                                     resized, so the pointer remains valid. */
-
-  const Tin *A_base = A.data(); /* Points to start of current row of A,
-                                   monotonically increasing by K. */
-  const Tin *B_base = B_origin; /* Points to start of current column of B;
-                                   increases by 1 in each inner loop, resets
-                                   to B_origin (0) at the start of a new row
-                                   (outer loop). */
-
-  const Tin *A_ptr = A_base;
-  const Tin *B_ptr = B_base;
-  Tout *C_ptr = C.data(); /* Monotonically increasing by 1. */
-
   for (int row = 0; row < M; row++) {
     for (int col = 0; col < N; col++) {
-      A_ptr = A_base;
-      B_ptr = B_base;
       Tacc running_sum = 0;
-      for (int k = 0; k < n_K_blocks; k++) {
-        for (int i = 0; i < K_block_size; i++) {
-          running_sum += Tacc(*A_ptr * *B_ptr);
-          A_ptr += 1; // Advance to right neighbor; next value in this row
-          B_ptr += N; // Advance to bottom neighbor; next value in this column
-        }
+      for (int k = 0; k < K; k++) {
+        running_sum += Tacc(A[row * K + k] * B[k * N + col]);
       }
-      *C_ptr = Tout(running_sum);
-      C_ptr += 1;
-      B_base += 1; /* Next iteration: same row of A (A_base unchanged),
-                      next column of B (B_base increases by 1) */
+      C[row * N + col] = Tout(running_sum);
     }
-    A_base += K;       // Advance to next row of A
-    B_base = B_origin; /* Next row of A means we need to restart at the first
-                          column of B. */
   }
 }
 
+
 template <typename Tin, typename Tout, typename Tacc>
 Tout mul_acc(int M, int N, int K, int row, int col, const std::vector<Tin> A,
              const std::vector<Tin> B) {

From 5af0e9d6d1b2d082ded44ccaa21b347e51b90726 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 13:45:26 -0700
Subject: [PATCH 04/17] [matmul] simplify whole_array strides to element size
 after PR #1538

---
 .../matrix_multiplication/whole_array/aie2.py | 96 +++++--------------
 1 file changed, 25 insertions(+), 71 deletions(-)

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
index 022e3b975f..88e5346cc2 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -63,45 +63,13 @@ def my_matmul(M, K, N, m, k, n):
     assert k % s == 0
     assert n % t == 0
 
-    word_size_in = 2
-    word_size_out = 2
-
     # If you get errors during CDO generation due to running out of program
     # memory, it may be because too much code is generated due to ObjectFIFO
     # loop unrollings. Reducing the depth to 1 here will work around that at
     # a big performance cost.
     fifo_depth = 1
 
-    A_sz_in_i32s = M * K * word_size_in // 4
-    B_sz_in_i32s = K * N * word_size_in // 4
-    C_sz_in_bytes = M * N * word_size_out
-    C_sz_in_i32s = C_sz_in_bytes // 4
-
-    M_div_m = M // m
-    M_div_m_div_n_rows = M // (m * n_rows)
-    K_div_k = K // k
-    N_div_n = N // n
-    tiles = M_div_m * N_div_n // n_cores
-    N_div_n_div_n_cols = N_div_n // n_cols
-
-    # Matrix A: MxK, submatrices a: mxk
-    k_in_i32s = k * word_size_in // 4
-    K_in_i32s = K * word_size_in // 4
-    m_x_n_rows = m * n_rows
-
-    # Matrix B: KxN, submatrices b: kxn
-    n_in_i32s = n * word_size_in // 4
-    N_in_i32s = N * word_size_in // 4
-    k_x_N_in_i32s = k * N * word_size_in // 4
-    n_x_n_cols_in_i32s = n_in_i32s * n_cols
-
-    # Output Matrix C: MxN
-    n_in_i32s_out = n * word_size_out // 4
-    N_in_i32s_out = N * word_size_out // 4
-    m_x_n_rows_x_N_in_i32s_out = m * n_rows * N_in_i32s_out
-    n_x_n_cols_in_i32s_out = n_in_i32s_out * n_cols
-
-    vectorized = True
+    n_tiles = (M // m) * (N // n) // n_cores
 
     with mlir_mod_ctx() as ctx:
 
@@ -293,7 +261,7 @@ def device_body():
                     def core_body():
                         for _ in for_(0xFFFFFFFF):
                             for _ in (
-                                for_(tiles) if tiles > 1 else range(1)
+                                for_(n_tiles) if n_tiles > 1 else range(1)
                             ):  # Workaround for issue #1547
                                 elem_out = memC_fifos[j][memC_fifo_names[j][i]].acquire(
                                     ObjectFifoPort.Produce,
@@ -301,7 +269,7 @@ def core_body():
                                 )
                                 call(zero, [elem_out])
 
-                                for _ in for_(K_div_k):
+                                for _ in for_(K // k):
                                     elem_in_a = memA_fifos[memA_fifo_names[i]].acquire(
                                         ObjectFifoPort.Consume,
                                         1,
@@ -324,82 +292,68 @@ def core_body():
                                 )
                                 yield_([])
 
-                            if tiles > 1:  # workaround for issue #1547
+                            if n_tiles > 1:  # workaround for issue #1547
                                 yield_([])
 
             # To/from AIE-array data movement
 
             @FuncOp.from_py_func(
-                T.memref(A_sz_in_i32s, T.i32()),
-                T.memref(B_sz_in_i32s, T.i32()),
-                T.memref(C_sz_in_i32s, T.i32()),
+                T.memref(M * K, T.bf16()),
+                T.memref(K * N, T.bf16()),
+                T.memref(M *N , T.bf16()),
             )
             def sequence(A, B, C):
                 # only do 5 tile rows at a time before synchronizing, so we can reuse BDs
                 rows_per_block = 5
                 for tile_row_block in range(
-                    (M_div_m_div_n_rows + rows_per_block - 1) // rows_per_block
+                    (M // m // n_rows + rows_per_block - 1) // rows_per_block
                 ):
                     num_tile_rows = min(
                         [
                             rows_per_block,
-                            M_div_m_div_n_rows - tile_row_block * rows_per_block,
+                            M // m // n_rows - tile_row_block * rows_per_block,
                         ]
                     )
                     C_row_offset = (
-                        tile_row_block * rows_per_block * m * n_rows * N * word_size_out
+                        tile_row_block * rows_per_block * m * n_rows * N
                     )
                     for i in range(n_cols):
-                        C_col_offset = i * n * word_size_out
-                        C_offset_in_i32s = (C_col_offset + C_row_offset) // 4
+                        C_col_offset = i * n
+                        C_offset = (C_col_offset + C_row_offset) * 2
                         npu_dma_memcpy_nd(
                             metadata=outC_fifo_names[i],
                             bd_id=0,
                             mem=C,
-                            offsets=[0, 0, 0, C_offset_in_i32s],
-                            sizes=[
-                                num_tile_rows,
-                                N_div_n_div_n_cols,
-                                m_x_n_rows,
-                                n_in_i32s_out,
-                            ],
-                            strides=[
-                                m_x_n_rows_x_N_in_i32s_out,
-                                n_x_n_cols_in_i32s_out,
-                                N_in_i32s_out,
+                            offsets=[0, 0, 0, C_offset],
+                            sizes=[num_tile_rows, N // n // n_cols, m * n_rows, n],
+                            strides=[m * n_rows * N, n * n_cols, N
                             ],
                         )
                         for tile_row in range(num_tile_rows):
-                            A_row_offset_in_i32s = (
+                            A_row_offset = (
                                 ((tile_row_block * rows_per_block) + tile_row)
                                 * n_rows
                                 * m
                                 * K
-                                * word_size_in
-                                // 4
                             )
-                            A_col_offset_in_i32s = i * m * K * word_size_in // 4
-                            B_col_offset_in_i32s = i * n * word_size_in // 4
+                            A_col_offset = i * m * K
+                            A_offset = A_row_offset + A_col_offset
+                            B_col_offset = i * n
                             npu_dma_memcpy_nd(
                                 metadata=inA_fifo_names[i],
                                 bd_id=2 * tile_row + 1,
                                 mem=A,
-                                offsets=[
-                                    0,
-                                    0,
-                                    0,
-                                    A_col_offset_in_i32s + A_row_offset_in_i32s,
-                                ],
-                                sizes=[N_div_n_div_n_cols, K_div_k, m, k_in_i32s],
-                                strides=[0, k_in_i32s, K_in_i32s],
+                                offsets=[0, 0, 0, A_offset ],
+                                sizes=[N // n // n_cols, K // k, m, k],
+                                strides=[0, k, K],
                             )
                             npu_dma_memcpy_nd(
                                 metadata=inB_fifo_names[i],
                                 bd_id=2 * tile_row + 2,
                                 mem=B,
-                                offsets=[0, 0, 0, B_col_offset_in_i32s],
-                                sizes=[N_div_n_div_n_cols, K_div_k, k, n_in_i32s],
-                                strides=[n_x_n_cols_in_i32s, k_x_N_in_i32s, N_in_i32s],
+                                offsets=[0, 0, 0, B_col_offset],
+                                sizes=[N // n // n_cols, K // k, k, n],
+                                strides=[n * n_cols, k * N, N],
                             )
                     for i in range(n_cols):
                         npu_sync(column=i, row=0, direction=0, channel=0)

From cf43febf95f430be111c737f78cac5933003d6dc Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 16:17:08 -0700
Subject: [PATCH 05/17] [matmul] offsets seem to still be in bytes; fix

---
 .../basic/matrix_multiplication/single_core/aie2.py             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index ba312aa417..cf298e2645 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -194,7 +194,7 @@ def sequence(A, B, C):
                     for tile_row in range(num_tile_rows):
                         A_row_offset = (
                             ((tile_row_block * rows_per_block) + tile_row) * m * K
-                        )
+                        ) * 2
                         npu_dma_memcpy_nd(
                             metadata="inA",
                             bd_id=2 * tile_row + 1,

From 2529d275f6db1420816d13c2ef08c9df913d3527 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 16:26:42 -0700
Subject: [PATCH 06/17] [matmul] fix matrix printing error

---
 programming_examples/basic/matrix_multiplication/common.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h
index f045835698..d5997a616e 100644
--- a/programming_examples/basic/matrix_multiplication/common.h
+++ b/programming_examples/basic/matrix_multiplication/common.h
@@ -130,7 +130,6 @@ void matmul(int M, int N, int K, const std::vector<Tin> A,
   }
 }
 
-
 template <typename Tin, typename Tout, typename Tacc>
 Tout mul_acc(int M, int N, int K, int row, int col, const std::vector<Tin> A,
              const std::vector<Tin> B) {
@@ -200,7 +199,8 @@ void print_matrix(const std::vector<T> matrix, int n_cols,
   if (elide_cols) {                                                            \
     ostream << std::setw(0) << elide_sym;                                      \
   }                                                                            \
-  for (int col = n_printable_cols / 2 + 1; col < n_printable_cols; col++) {    \
+  for (int i = 0; i < (n_printable_cols - 1) / 2; i++) {                       \
+    int col = n_cols - (n_printable_cols - 1) / 2 + i;                         \
     ostream << std::right << std::setw(w) << (what);                           \
     ostream << std::setw(0) << col_sep;                                        \
   }
@@ -213,7 +213,8 @@ void print_matrix(const std::vector<T> matrix, int n_cols,
     print_row(elide_sym);
     ostream << std::endl;
   }
-  for (int row = n_printable_rows / 2 + 1; row < n_printable_rows; row++) {
+  for (int i = 0; i < (n_printable_rows - 1) / 2; i++) {
+    int row = n_rows - (n_printable_rows - 1) / 2 + i;
     print_row(matrix[row * n_cols + col]);
     ostream << std::endl;
   }

From d76ce27ad6e5b4b2dfdd592dac894495d370ed68 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 16:28:28 -0700
Subject: [PATCH 07/17] [matmul] allow single_core overall and tile size to be
 adjusted

---
 .../single_core/Makefile                      |  20 +++-
 .../matrix_multiplication/single_core/aie2.py | 105 ++++++++++++------
 2 files changed, 87 insertions(+), 38 deletions(-)

diff --git a/programming_examples/basic/matrix_multiplication/single_core/Makefile b/programming_examples/basic/matrix_multiplication/single_core/Makefile
index ca92224df8..92f48a545a 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/Makefile
+++ b/programming_examples/basic/matrix_multiplication/single_core/Makefile
@@ -10,12 +10,20 @@ subdir=single_core
 targetname=matrixMultiplication
 kernels=mm
 
-# Currently does not accept reconfiguring size via these variables; must change
-# in source at aie2.py as well as here
-M=256
-K=256
-N=256
+M?=256
+K?=256
+N?=256
+m?=64
+k?=64
+n?=64
+
+kernels=mm_${m}x${k}x${n}
+aieargs+=-m $m -k $k -n $n
+target_suffix=${M}x${K}x${N}_${m}x${k}x${n}
 
-SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
 include ${SELF_DIR}../makefile-common
 
+build/mm_${m}x${k}x${n}.o: ${kernels_dir}/mm.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -DDIM_M=${m} -DDIM_K=${k} -DDIM_N=${n} -c $< -o ${@F}
+
diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index cf298e2645..6b6a48e400 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -5,24 +5,45 @@
 #
 # (c) Copyright 2023 AMD Inc.
 
+import sys
+import argparse
+
+from aie.extras.context import mlir_mod_ctx
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
 from aie.dialects.scf import *
-from aie.extras.context import mlir_mod_ctx
 import aie.utils.trace as trace_utils
 
 
-def my_matmul():
-    M = 256
-    K = 256
-    N = 256
-    m = 64
-    k = 64
-    n = 64
+def main():
+    argparser = argparse.ArgumentParser(
+        prog="AIE Matrix Multiplication MLIR Design (Whole Array)",
+        description="Emits MLIR code for a matrix multiplication design of the given input size",
+    )
+    argparser.add_argument("-M", type=int, default=256)
+    argparser.add_argument("-K", type=int, default=256)
+    argparser.add_argument("-N", type=int, default=256)
+    argparser.add_argument("-m", type=int, default=64)
+    argparser.add_argument("-k", type=int, default=64)
+    argparser.add_argument("-n", type=int, default=64)
+    args = argparser.parse_args()
+    my_matmul(args.M, args.K, args.N, args.m, args.k, args.n)
+
+
+def my_matmul(M, K, N, m, k, n):
+
+    assert M % m == 0
+    assert K % k == 0
+    assert N % n == 0
+
     r = 4
     s = 8
     t = 4
 
+    assert m % r == 0
+    assert k % s == 0
+    assert n % t == 0
+
     vectorized = True
     enable_tracing = False
     trace_size = 65536
@@ -81,12 +102,16 @@ def device_body():
                 compute_tile2,
                 2,
                 memref_a_ty,
-                [
-                    (m // r, r * k),
-                    (k // s, s),
-                    (r, k),
-                    (s, 1),
-                ],
+                (
+                    [
+                        (m // r, r * k),
+                        (k // s, s),
+                        (r, k),
+                        (s, 1),
+                    ]
+                    if vectorized
+                    else []
+                ),
             )
             object_fifo_link(inA, memA)
 
@@ -98,12 +123,16 @@ def device_body():
                 compute_tile2,
                 2,
                 memref_b_ty,
-                [
-                    (k // s, s * n),
-                    (n // t, t),
-                    (s, n),
-                    (t, 1),
-                ],
+                (
+                    [
+                        (k // s, s * n),
+                        (n // t, t),
+                        (s, n),
+                        (t, 1),
+                    ]
+                    if vectorized
+                    else []
+                ),
             )
             object_fifo_link(inB, memB)
 
@@ -115,12 +144,16 @@ def device_body():
                 shim_tile,
                 2,
                 memref_c_ty,
-                [
-                    (m // r, r * n),
-                    (r, t),
-                    (n // t, r * t),
-                    (t, 1),
-                ],
+                (
+                    [
+                        (m // r, r * n),
+                        (r, t),
+                        (n // t, r * t),
+                        (t, 1),
+                    ]
+                    if vectorized
+                    else []
+                ),
             )
             object_fifo_link(memC, outC)
 
@@ -131,17 +164,19 @@ def device_body():
             # Set up compute tiles
 
             # Compute tile 2
-            @core(compute_tile2, "mm.o")
+            @core(compute_tile2, f"mm_{m}x{k}x{n}.o")
             def core_body():
                 for _ in for_(0xFFFFFFFF):
-                    for _ in for_(tiles):
+                    for _ in for_(tiles) if tiles > 1 else range(1):  # issue #1547
                         elem_out = memC.acquire(ObjectFifoPort.Produce, 1)
                         if vectorized:
                             call(zero, [elem_out])
                         else:
                             call(zero_scalar, [elem_out])
 
-                        for _ in for_(K_div_k):
+                        for _ in (
+                            for_(K_div_k) if K_div_k > 1 else range(1)
+                        ):  # issue #1547
                             elem_in_a = memA.acquire(ObjectFifoPort.Consume, 1)
                             elem_in_b = memB.acquire(ObjectFifoPort.Consume, 1)
                             if vectorized:
@@ -150,10 +185,12 @@ def core_body():
                                 call(matmul_scalar, [elem_in_a, elem_in_b, elem_out])
                             memA.release(ObjectFifoPort.Consume, 1)
                             memB.release(ObjectFifoPort.Consume, 1)
-                            yield_([])
+                            if K_div_k > 1:
+                                yield_([])
 
                         memC.release(ObjectFifoPort.Produce, 1)
-                        yield_([])
+                        if tiles > 1:
+                            yield_([])
                     yield_([])
 
             # To/from AIE-array data movement
@@ -216,4 +253,8 @@ def sequence(A, B, C):
     print(ctx.module)
 
 
-my_matmul()
+if __name__ == "__main__":
+    main()
+else:
+    print("Not meant to be imported")
+    sys.exit(1)

From 9f86e90b7d73cd8e74bc8342c134bf581eb1e402 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 16:28:44 -0700
Subject: [PATCH 08/17] [matmul] fix typo in makefile-common

---
 .../basic/matrix_multiplication/makefile-common               | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common
index bd1eb11409..1e095eda07 100644
--- a/programming_examples/basic/matrix_multiplication/makefile-common
+++ b/programming_examples/basic/matrix_multiplication/makefile-common
@@ -38,12 +38,12 @@ N?=512
 
 trace_size?=65536
 
-target_suffix?=${M}x${K}x${n}
+target_suffix?=${M}x${K}x${N}
 mlir_target?=build/aie_${target_suffix}.mlir
 xclbin_target?=build/final_${target_suffix}.xclbin
 insts_target?=build/insts_${target_suffix}.txt
 
-runargs?=-v 1 --warmup 1 --iters 1
+runargs?=-v 2 --warmup 1 --iters 1
 aieargs+=-M $M -K $K -N $N
 
 kernels_dir=${srcdir}/../../../../aie_kernels/aie2

From 20349d9a92a614f6d9fc97867bb6089368ab5f66 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 16:29:38 -0700
Subject: [PATCH 09/17] [matmul] restore fifo depth to two for whole_array

---
 .../basic/matrix_multiplication/whole_array/aie2.py             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
index 88e5346cc2..17c748ad0d 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -67,7 +67,7 @@ def my_matmul(M, K, N, m, k, n):
     # memory, it may be because too much code is generated due to ObjectFIFO
     # loop unrollings. Reducing the depth to 1 here will work around that at
     # a big performance cost.
-    fifo_depth = 1
+    fifo_depth = 2
 
     n_tiles = (M // m) * (N // n) // n_cores
 

From 654413456b346edb709968d862d5ee58f2b41e2e Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 16:36:05 -0700
Subject: [PATCH 10/17] [matmul] express offsets in bytes in whole_array design;
 reformat

---
 .../matrix_multiplication/whole_array/aie2.py     | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
index 17c748ad0d..4adb1cd7e2 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -300,7 +300,7 @@ def core_body():
             @FuncOp.from_py_func(
                 T.memref(M * K, T.bf16()),
                 T.memref(K * N, T.bf16()),
-                T.memref(M *N , T.bf16()),
+                T.memref(M * N, T.bf16()),
             )
             def sequence(A, B, C):
                 # only do 5 tile rows at a time before synchronizing, so we can reuse BDs
@@ -314,9 +314,7 @@ def sequence(A, B, C):
                             M // m // n_rows - tile_row_block * rows_per_block,
                         ]
                     )
-                    C_row_offset = (
-                        tile_row_block * rows_per_block * m * n_rows * N
-                    )
+                    C_row_offset = tile_row_block * rows_per_block * m * n_rows * N
                     for i in range(n_cols):
                         C_col_offset = i * n
                         C_offset = (C_col_offset + C_row_offset) * 2
@@ -326,8 +324,7 @@ def sequence(A, B, C):
                             mem=C,
                             offsets=[0, 0, 0, C_offset],
                             sizes=[num_tile_rows, N // n // n_cols, m * n_rows, n],
-                            strides=[m * n_rows * N, n * n_cols, N
-                            ],
+                            strides=[m * n_rows * N, n * n_cols, N],
                         )
                         for tile_row in range(num_tile_rows):
                             A_row_offset = (
@@ -337,13 +334,13 @@ def sequence(A, B, C):
                                 * K
                             )
                             A_col_offset = i * m * K
-                            A_offset = A_row_offset + A_col_offset
-                            B_col_offset = i * n
+                            A_offset = (A_row_offset + A_col_offset) * 2
+                            B_col_offset = i * n * 2
                             npu_dma_memcpy_nd(
                                 metadata=inA_fifo_names[i],
                                 bd_id=2 * tile_row + 1,
                                 mem=A,
-                                offsets=[0, 0, 0, A_offset ],
+                                offsets=[0, 0, 0, A_offset],
                                 sizes=[N // n // n_cols, K // k, m, k],
                                 strides=[0, k, K],
                             )

From c6974592f7536c3f012de814d72fa1089ecc1c5c Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 16:44:13 -0700
Subject: [PATCH 11/17] [matmul] reduce verification tolerance to 5% relative,
 0.5 absolute

---
 programming_examples/basic/matrix_multiplication/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h
index d5997a616e..1396a39d45 100644
--- a/programming_examples/basic/matrix_multiplication/common.h
+++ b/programming_examples/basic/matrix_multiplication/common.h
@@ -236,7 +236,7 @@ template <typename Tout>
 std::optional<struct error<Tout>>
 verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual) {
   const float absTol = 0.5;
-  const float relTol = 0.15;
+  const float relTol = 0.05;
   if (!nearly_equal(expected, actual, relTol, absTol)) {
     return (struct error<Tout>){row, col, expected, actual};
   }

From 07c029cf76309f6797d63e2b6c8974a1a66d5fd1 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Wed, 12 Jun 2024 16:59:19 -0700
Subject: [PATCH 12/17] [matmul] format

---
 aie_kernels/aie2/mm.cc                                    | 3 ++-
 programming_examples/basic/matrix_multiplication/test.cpp | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)
 mode change 100755 => 100644 aie_kernels/aie2/mm.cc

diff --git a/aie_kernels/aie2/mm.cc b/aie_kernels/aie2/mm.cc
old mode 100755
new mode 100644
index 35437afb40..0444fa6018
--- a/aie_kernels/aie2/mm.cc
+++ b/aie_kernels/aie2/mm.cc
@@ -431,7 +431,8 @@ extern "C" {
                              r, s, t)                                          \
   void matmul_scalar_##mlir_type_in##_##mlir_type_out(                         \
       ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) {                      \
-    matmul_scalar<ctype_in, ctype_out, DIM_M, DIM_K, DIM_N>(a_in, b_in, c_out);\
+    matmul_scalar<ctype_in, ctype_out, DIM_M, DIM_K, DIM_N>(a_in, b_in,        \
+                                                            c_out);            \
   }
 
 #define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,              \
diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp
index e3786b0d2a..c838f30aeb 100644
--- a/programming_examples/basic/matrix_multiplication/test.cpp
+++ b/programming_examples/basic/matrix_multiplication/test.cpp
@@ -140,6 +140,7 @@ int main(int argc, const char *argv[]) {
   std::vector<A_DATATYPE> AVec(A_VOLUME);
   for (int i = 0; i < A_VOLUME; i++) {
     AVec[i] = matmul_common::random_bfloat16_t();
+    // AVec[i] = i;
   }
   memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE)));
   B_DATATYPE *bufB = bo_b.map<B_DATATYPE *>();

From 1dc7a6adddb3fc8a09a1c2e0ca88d377d5afd003 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Thu, 13 Jun 2024 08:27:50 -0700
Subject: [PATCH 13/17] [matmul] fix CI test errors

---
 .../basic/matrix_multiplication/matrix_vector/test.cpp         | 1 +
 .../basic/matrix_multiplication/single_core/Makefile           | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp
index 000d47499c..862256fa15 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp
@@ -14,5 +14,6 @@
 using A_DATATYPE = std::bfloat16_t;
 using B_DATATYPE = std::bfloat16_t;
 using C_DATATYPE = float;
+using ACC_DATATYPE = float;
 
 #include "../test.cpp"
diff --git a/programming_examples/basic/matrix_multiplication/single_core/Makefile b/programming_examples/basic/matrix_multiplication/single_core/Makefile
index 92f48a545a..1142d7357e 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/Makefile
+++ b/programming_examples/basic/matrix_multiplication/single_core/Makefile
@@ -6,6 +6,7 @@
 # 
 ##===----------------------------------------------------------------------===##
 
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 subdir=single_core
 targetname=matrixMultiplication
 kernels=mm
@@ -21,7 +22,7 @@ kernels=mm_${m}x${k}x${n}
 aieargs+=-m $m -k $k -n $n
 target_suffix=${M}x${K}x${N}_${m}x${k}x${n}
 
-include ${SELF_DIR}../makefile-common
+include ${srcdir}/../makefile-common
 
 build/mm_${m}x${k}x${n}.o: ${kernels_dir}/mm.cc
 	mkdir -p ${@D}

From 762d866b2bc1cb7a68e8402179785e41b3ad255b Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Fri, 14 Jun 2024 14:37:58 -0700
Subject: [PATCH 14/17] [matmul] fix matrix printing rounding error

---
 .../basic/matrix_multiplication/common.h               | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h
index 1396a39d45..b1a7e92347 100644
--- a/programming_examples/basic/matrix_multiplication/common.h
+++ b/programming_examples/basic/matrix_multiplication/common.h
@@ -171,7 +171,7 @@ void print_matrix(const std::vector<T> matrix, int n_cols,
   assert(matrix.size() % n_cols == 0);
 
   auto maxima = std::minmax_element(matrix.begin(), matrix.end());
-  T max_val = std::max(*maxima.first, std::abs(*maxima.second));
+  T max_val = std::max(*maxima.first, (T)std::abs(*maxima.second));
   size_t n_digits = log10(max_val);
   if (w == -1) {
     w = n_digits;
@@ -199,8 +199,8 @@ void print_matrix(const std::vector<T> matrix, int n_cols,
   if (elide_cols) {                                                            \
     ostream << std::setw(0) << elide_sym;                                      \
   }                                                                            \
-  for (int i = 0; i < (n_printable_cols - 1) / 2; i++) {                       \
-    int col = n_cols - (n_printable_cols - 1) / 2 + i;                         \
+  for (int i = 0; i < n_printable_cols / 2; i++) {                             \
+    int col = n_cols - n_printable_cols / 2 + i;                               \
     ostream << std::right << std::setw(w) << (what);                           \
     ostream << std::setw(0) << col_sep;                                        \
   }
@@ -213,8 +213,8 @@ void print_matrix(const std::vector<T> matrix, int n_cols,
     print_row(elide_sym);
     ostream << std::endl;
   }
-  for (int i = 0; i < (n_printable_rows - 1) / 2; i++) {
-    int row = n_rows - (n_printable_rows - 1) / 2 + i;
+  for (int i = 0; i < n_printable_rows / 2; i++) {
+    int row = n_rows - n_printable_rows / 2 + i;
     print_row(matrix[row * n_cols + col]);
     ostream << std::endl;
   }

From cfe3bbf6cc71d55f065dc8d961f9fe75cfe80f8f Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Fri, 14 Jun 2024 14:49:30 -0700
Subject: [PATCH 15/17] [matvec] use integers to avoid float errors; swap in
 scalar kernel for now to pass verification

---
 aie_kernels/aie2/mv.cc                        | 36 +++++++----
 .../matrix_vector/Makefile                    |  9 ++-
 .../matrix_vector/aie2.py                     | 60 ++++++++++++-------
 .../matrix_vector/test.cpp                    |  9 +--
 4 files changed, 78 insertions(+), 36 deletions(-)

diff --git a/aie_kernels/aie2/mv.cc b/aie_kernels/aie2/mv.cc
index 7ac7903c53..42829ffd6b 100644
--- a/aie_kernels/aie2/mv.cc
+++ b/aie_kernels/aie2/mv.cc
@@ -45,7 +45,8 @@ void matvec_vectorized(T_in *__restrict a, T_in *__restrict b,
   static_assert(s == 8); // s is fixed to 8 because that is the number of
                          // column vectors (a_vec_0_0..a_vec_3_1) we create
   static_assert(k % s == 0);
-  static_assert(std::is_same<T_in, bfloat16>::value);
+  static_assert(std::is_same<T_in, bfloat16>::value ||
+                std::is_same<T_in, int16_t>::value);
 
   // This kernel expects a "32-bit word transposed matrix", i.e. the result
   // of transposing the row-major representation of the matrix at a
@@ -80,12 +81,12 @@ void matvec_vectorized(T_in *__restrict a, T_in *__restrict b,
         aie::accum<T_acc, r> c_acc_in;
         c_acc_in.from_vector(aie::load_v<r>(c_ptr));
 
-        const aie::vector<T_in, 2 * r> a_vec_0 = aie::load_v<2 * r>(a_ptr);
-        const aie::vector<T_in, 2 * r> a_vec_1 =
+        const aie::vector<T_in, 2 *r> a_vec_0 = aie::load_v<2 * r>(a_ptr);
+        const aie::vector<T_in, 2 *r> a_vec_1 =
             aie::load_v<2 * r>(a_ptr + 2 * m);
-        const aie::vector<T_in, 2 * r> a_vec_2 =
+        const aie::vector<T_in, 2 *r> a_vec_2 =
             aie::load_v<2 * r>(a_ptr + 4 * m);
-        const aie::vector<T_in, 2 * r> a_vec_3 =
+        const aie::vector<T_in, 2 *r> a_vec_3 =
             aie::load_v<2 * r>(a_ptr + 6 * m);
 
         // The even/odd calls below extract the interleaved columns of A.
@@ -133,35 +134,48 @@ void matvec_vectorized(T_in *__restrict a, T_in *__restrict b,
 
 extern "C" {
 
+// If you want to compile microkernels with different inner tile sizes,
+// define DIM_M and DIM_K at compile time using -DDIM_M=16 etc.
+// These dimensions must be divisible by the r, s dimensions used in
+// the kernels.
+
+#ifndef DIM_M
+#define DIM_M 32
+#endif
+
+#ifndef DIM_K
+#define DIM_K 32
+#endif
+
 #define combos(X)                                                              \
-  X(bfloat16, bf16, float, f32, accfloat)                                      \
-//    X(int16,         i16, int16,    i16, acc32)                                  \
+  /* X(bfloat16, bf16, float, f32, accfloat) */                                \
+  X(int16, i16, int32, i32, acc32)
 
 #define matvec_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, \
                              ctype_acc)                                        \
   void matvec_scalar_##mlir_type_in##_##mlir_type_out(                         \
       ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) {                      \
-    matvec_scalar<ctype_in, ctype_out, 32, 32>(a_in, b_in, c_out);             \
+    matvec_scalar<ctype_in, ctype_out, DIM_M, DIM_K>(a_in, b_in, c_out);       \
   }
 
 #define matvec_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,            \
                                  mlir_type_out, ctype_acc)                     \
   void matvec_vectorized_##mlir_type_in##_##mlir_type_out(                     \
       ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) {                      \
-    matvec_vectorized<ctype_in, ctype_out, ctype_acc, 32, 32, 16, 8>(          \
+    matvec_vectorized<ctype_in, ctype_out, ctype_acc, DIM_M, DIM_K, 16, 8>(    \
         a_in, b_in, c_out);                                                    \
   }
 
 #define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,              \
                                mlir_type_out, ctype_acc)                       \
   void zero_vectorized_##mlir_type_out(ctype_out *c_out) {                     \
-    zero_vectorized<ctype_out, 32, 1, 32>(c_out);                              \
+    zero_vectorized<ctype_out, DIM_M, 1, 32>(c_out);                           \
   }
 
 #define zero_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out,   \
                            ctype_acc)                                          \
   void zero_scalar_##mlir_type_out(ctype_out *c_out) {                         \
-    zero_scalar<ctype_out, 32, 1>(c_out);                                      \
+    zero_scalar<ctype_out, DIM_M, 1>(c_out);                                   \
   }
 
 combos(matvec_scalar_c_func) combos(matvec_vectorized_c_func)
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile b/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile
index c701ce9a50..c86e4761af 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile
@@ -8,14 +8,21 @@
 
 subdir=matrix_vector
 targetname=matrixVectorMultiplication
-kernels=mv
 
 # Currently does not accept reconfiguring size via these variables; must change
 # in source at aie2.py as well as here
 M=288
 K=288
 N=1
+m=32
+k=32
+
+kernels=mv_${m}x${k}
 
 SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
 include ${SELF_DIR}../makefile-common
 
+build/mv_${m}x${k}.o: ${kernels_dir}/mv.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -DDIM_M=${m} -DDIM_K=${k} -c $< -o ${@F}
+
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
index 54276121c8..0657d2fb10 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
@@ -32,26 +32,36 @@ def my_matmul():
     m_x_k = m * k
     m_x_K = m * K
 
-    vectorized = True
+    # FIXME vectorized kernel is currently erroneous
+    vectorized = False
+
+    dtype_in = T.i16
+    dtype_in_str = "i16"
+    dtype_out = T.i32
+    dtype_out_str = "i32"
 
     with mlir_mod_ctx() as ctx:
 
         @device(AIEDevice.npu1_4col)
         def device_body():
-            memRef_inA_ty = T.memref(m * k, T.bf16())
-            memRef_inB_ty = T.memref(k, T.bf16())
-            memRef_outC_ty = T.memref(m, T.f32())
-            memRef_A_ty = T.memref(m, k, T.bf16())
+            memRef_inA_ty = T.memref(m * k, dtype_in())
+            memRef_inB_ty = T.memref(k, dtype_in())
+            memRef_outC_ty = T.memref(m, dtype_out())
+            memRef_A_ty = T.memref(m, k, dtype_in())
 
             # AIE Core Function declarations
-            zero_scalar = external_func("zero_scalar_f32", inputs=[memRef_outC_ty])
-            zero = external_func("zero_vectorized_f32", inputs=[memRef_outC_ty])
+            zero_scalar = external_func(
+                f"zero_scalar_{dtype_out_str}", inputs=[memRef_outC_ty]
+            )
+            zero = external_func(
+                f"zero_vectorized_{dtype_out_str}", inputs=[memRef_outC_ty]
+            )
             matvec_scalar = external_func(
-                "matvec_scalar_bf16_f32",
+                f"matvec_scalar_{dtype_in_str}_{dtype_out_str}",
                 inputs=[memRef_A_ty, memRef_inB_ty, memRef_outC_ty],
             )
             matvec = external_func(
-                "matvec_vectorized_bf16_f32",
+                f"matvec_vectorized_{dtype_in_str}_{dtype_out_str}",
                 inputs=[memRef_A_ty, memRef_inB_ty, memRef_outC_ty],
             )
 
@@ -96,11 +106,15 @@ def device_body():
                     cores[i],
                     2,
                     memRef_A_ty,
-                    [
-                        (k // 2 // 2, 2),
-                        (m, k),
-                        (2, 1),
-                    ],  # transpose at 4-byte (2xbf16) granularity
+                    (
+                        [
+                            (k // 2 // 2, 2),
+                            (m, k),
+                            (2, 1),
+                        ]
+                        if vectorized
+                        else []
+                    ),  # if vectorized: transpose at 4-byte (2 x 16-bit element) granularity
                 )
                 object_fifo_link(
                     memA_fifos[memA_fifo_names[i]], inA_fifos[inA_fifo_names[i]]
@@ -128,14 +142,17 @@ def device_body():
             # Set up compute tiles
             for i in range(n_cores):
                 # Compute tile i
-                @core(cores[i], "mv.o")
+                @core(cores[i], f"mv_{m}x{k}.o")
                 def core_body():
                     for _ in for_(0xFFFFFFFF):
                         elem_out = outC_fifos[outC_fifo_names[i]].acquire(
                             ObjectFifoPort.Produce,
                             1,
                         )
-                        call(zero, [elem_out])
+                        if vectorized or True:
+                            call(zero, [elem_out])
+                        else:
+                            call(zero_scalar, [elem_out])
 
                         for _ in for_(K_div_k):
                             elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
@@ -146,7 +163,10 @@ def core_body():
                                 ObjectFifoPort.Consume,
                                 1,
                             )
-                            call(matvec, [elem_in_a, elem_in_b, elem_out])
+                            if vectorized:
+                                call(matvec, [elem_in_a, elem_in_b, elem_out])
+                            else:
+                                call(matvec_scalar, [elem_in_a, elem_in_b, elem_out])
                             inA_fifos[inA_fifo_names[i]].release(
                                 ObjectFifoPort.Consume,
                                 1,
@@ -166,9 +186,9 @@ def core_body():
             # To/from AIE-array data movement
 
             @FuncOp.from_py_func(
-                T.memref(A_sz, T.bf16()),
-                T.memref(B_sz, T.bf16()),
-                T.memref(C_sz, T.f32()),
+                T.memref(A_sz, dtype_in()),
+                T.memref(B_sz, dtype_in()),
+                T.memref(C_sz, dtype_out()),
             )
             def sequence(A, B, C):
                 npu_dma_memcpy_nd(
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp
index 862256fa15..eb41adafa3 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp
@@ -9,11 +9,12 @@
 //===----------------------------------------------------------------------===//
 
 #include <stdfloat>
+#include <stdint.h>
 
 #define DATATYPES_USING_DEFINED
-using A_DATATYPE = std::bfloat16_t;
-using B_DATATYPE = std::bfloat16_t;
-using C_DATATYPE = float;
-using ACC_DATATYPE = float;
+using A_DATATYPE = int16_t; // std::bfloat16_t;
+using B_DATATYPE = int16_t; // std::bfloat16_t;
+using C_DATATYPE = int32_t; // float;
+using ACC_DATATYPE = int32_t;
 
 #include "../test.cpp"

From df84e5bae4cb0a03b1ac542bd03b305620e1120b Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Sun, 23 Jun 2024 13:01:30 -0700
Subject: [PATCH 16/17] [matmul] add missing includes to common.h to make it
 work standalone

---
 programming_examples/basic/matrix_multiplication/common.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h
index b1a7e92347..67518b841b 100644
--- a/programming_examples/basic/matrix_multiplication/common.h
+++ b/programming_examples/basic/matrix_multiplication/common.h
@@ -19,6 +19,9 @@
 #include <cmath>
 #include <optional>
 #include <ostream>
+#include <fstream>
+#include <stdfloat>
+#include <bits/stdc++.h>
 
 namespace matmul_common {
 

From 828b13da810867fecb00ed40a670cffaa13f9392 Mon Sep 17 00:00:00 2001
From: andrej <an.roesti@gmail.com>
Date: Sun, 23 Jun 2024 21:06:56 -0700
Subject: [PATCH 17/17] format

---
 aie_kernels/aie2/mv.cc                                    | 8 ++++----
 programming_examples/basic/matrix_multiplication/common.h | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/aie_kernels/aie2/mv.cc b/aie_kernels/aie2/mv.cc
index 42829ffd6b..42c9fc4b2d 100644
--- a/aie_kernels/aie2/mv.cc
+++ b/aie_kernels/aie2/mv.cc
@@ -81,12 +81,12 @@ void matvec_vectorized(T_in *__restrict a, T_in *__restrict b,
         aie::accum<T_acc, r> c_acc_in;
         c_acc_in.from_vector(aie::load_v<r>(c_ptr));
 
-        const aie::vector<T_in, 2 *r> a_vec_0 = aie::load_v<2 * r>(a_ptr);
-        const aie::vector<T_in, 2 *r> a_vec_1 =
+        const aie::vector<T_in, 2 * r> a_vec_0 = aie::load_v<2 * r>(a_ptr);
+        const aie::vector<T_in, 2 * r> a_vec_1 =
             aie::load_v<2 * r>(a_ptr + 2 * m);
-        const aie::vector<T_in, 2 *r> a_vec_2 =
+        const aie::vector<T_in, 2 * r> a_vec_2 =
             aie::load_v<2 * r>(a_ptr + 4 * m);
-        const aie::vector<T_in, 2 *r> a_vec_3 =
+        const aie::vector<T_in, 2 * r> a_vec_3 =
             aie::load_v<2 * r>(a_ptr + 6 * m);
 
         // The even/odd calls below extract the interleaved columns of A.
diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h
index 67518b841b..b2c6c14b53 100644
--- a/programming_examples/basic/matrix_multiplication/common.h
+++ b/programming_examples/basic/matrix_multiplication/common.h
@@ -15,13 +15,13 @@
 #define MATRIX_MULTIPLICATION_H
 
 #include <algorithm>
+#include <bits/stdc++.h>
 #include <boost/program_options.hpp>
 #include <cmath>
+#include <fstream>
 #include <optional>
 #include <ostream>
-#include <fstream>
 #include <stdfloat>
-#include <bits/stdc++.h>
 
 namespace matmul_common {