From 154581bf25ee13bd79efe54d0870a416d79c9e55 Mon Sep 17 00:00:00 2001 From: andrej Date: Tue, 11 Jun 2024 16:20:08 -0700 Subject: [PATCH 01/17] [matmul] work around object fifo bug; accumulate in float for verification; probabilistically verify beyond threshold size; add assertions and comments; improve verification output; add workaround comment for excessive program size --- .../basic/matrix_multiplication/common.h | 47 +++++++++++----- .../basic/matrix_multiplication/test.cpp | 54 ++++++++++++++---- .../matrix_multiplication/whole_array/aie2.py | 55 +++++++++++++++---- 3 files changed, 122 insertions(+), 34 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index b7386f268c..851eb8ded7 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -130,7 +130,7 @@ void matmul_naive(int M, int N, int K, const std::vector A, } } -template +template void matmul(int M, int N, int K, const std::vector A, const std::vector B, std::vector &C) { // A is an MxK matrix @@ -159,10 +159,10 @@ void matmul(int M, int N, int K, const std::vector A, for (int col = 0; col < N; col++) { A_ptr = A_base; B_ptr = B_base; - Tout running_sum = 0; + Tacc running_sum = 0; for (int k = 0; k < n_K_blocks; k++) { for (int i = 0; i < K_block_size; i++) { - running_sum += Tout(*A_ptr) * Tout(*B_ptr); + running_sum += Tacc(*A_ptr * *B_ptr); A_ptr += 1; // Advance to right neighbor; next value in this row B_ptr += N; // Advance to bottom neighbor; next value in this column } @@ -178,14 +178,14 @@ void matmul(int M, int N, int K, const std::vector A, } } -template +template Tout mul_acc(int M, int N, int K, int row, int col, const std::vector A, const std::vector B) { - Tout running_sum = 0; + Tacc running_sum = 0; for (int k = 0; k < K; k++) { - running_sum += Tout(A[row * K + k] * B[k * N + col]); + running_sum += Tacc(A[row * 
K + k] * B[k * N + col]); } - return running_sum; + return (Tout)running_sum; } // nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0 @@ -291,7 +291,8 @@ verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual) { template void print_error_summary(std::ostream &os, int n_errors, - std::vector> &errors) { + std::vector> &errors, + Tout max_rel_error) { for (struct error &err : errors) { os << "[" << std::setw(5) << err.row << ", " << std::setw(5) << err.col << "] " << std::setw(4) << std::setprecision(2) << std::fixed @@ -302,6 +303,10 @@ void print_error_summary(std::ostream &os, int n_errors, os << "...and " << std::setw(0) << n_errors - max_printable_errors << " further errors." << std::endl; } + if (n_errors > 0) { + os << "Maximum relative error: " << std::setw(3) << std::setprecision(0) + << max_rel_error * 100 << "%" << std::endl; + } } void print_progress_bar(std::ostream &os, double progress, int len = 75) { @@ -311,14 +316,15 @@ void print_progress_bar(std::ostream &os, double progress, int len = 75) { << "\r"; } -template +template int verify(int M, int N, int K, std::vector A, std::vector B, std::vector C, int verbosity = 0) { int n_errors = 0; std::vector> errors; + Tout max_rel_error = (Tout)0.0f; std::vector CRef(M * N); - matmul(M, N, K, A, B, CRef); + matmul(M, N, K, A, B, CRef); for (int row = 0; row < M; row++) { for (int col = 0; col < N; col++) { @@ -328,11 +334,17 @@ int verify(int M, int N, int K, std::vector A, std::vector B, if (n_errors < max_printable_errors) { errors.push_back(*error); } + Tout rel_error = + std::abs(error->actual - error->expected) / + std::max(std::abs(error->actual), std::abs(error->expected)); + if (rel_error > max_rel_error) { + max_rel_error = rel_error; + } n_errors++; } } } - print_error_summary(std::cout, n_errors, errors); + print_error_summary(std::cout, n_errors, errors, max_rel_error); if (n_errors > 0) { std::cout << std::endl << "Reference:" << std::endl; @@ -344,7 
+356,7 @@ int verify(int M, int N, int K, std::vector A, std::vector B, return n_errors; } -template +template int verify_stochastic(int M, int N, int K, std::vector A, std::vector B, std::vector C, int n_samples, int verbosity = 0) { @@ -359,6 +371,7 @@ int verify_stochastic(int M, int N, int K, std::vector A, int n_errors = 0; std::vector> errors; + Tout max_rel_error = (Tout)0.0f; double progress = 0; for (std::tuple> cell : std::views::enumerate(std::views::zip(sampled_rows, sampled_cols))) { @@ -371,19 +384,25 @@ int verify_stochastic(int M, int N, int K, std::vector A, progress = (double)i / n_samples; print_progress_bar(std::cerr, progress); } - Tout ref = mul_acc(M, N, K, row, col, A, B); + Tout ref = mul_acc(M, N, K, row, col, A, B); std::optional> error = verify_single(std::cout, row, col, ref, C[row * N + col]); if (error.has_value()) { if (n_errors < max_printable_errors) { errors.push_back(*error); } + Tout rel_error = + std::abs(error->actual - error->expected) / + std::max(std::abs(error->actual), std::abs(error->expected)); + if (rel_error > max_rel_error) { + max_rel_error = rel_error; + } n_errors++; } } std::cout << std::endl; - print_error_summary(std::cout, n_errors, errors); + print_error_summary(std::cout, n_errors, errors, max_rel_error); return n_errors; } diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp index fded6f0de7..e3786b0d2a 100644 --- a/programming_examples/basic/matrix_multiplication/test.cpp +++ b/programming_examples/basic/matrix_multiplication/test.cpp @@ -31,8 +31,12 @@ using A_DATATYPE = std::bfloat16_t; using B_DATATYPE = std::bfloat16_t; using C_DATATYPE = std::bfloat16_t; +using ACC_DATATYPE = float; #endif +constexpr long long verify_stochastic_threshold = 1024 * 1024 * 1024; +constexpr int verify_stochastic_n_samples = 1000; + namespace po = boost::program_options; int main(int argc, const char *argv[]) { @@ -54,6 +58,8 @@ int main(int 
argc, const char *argv[]) { int M = vm["M"].as(); int K = vm["K"].as(); int N = vm["N"].as(); + bool do_verify_stochastic = + (long long)M * N * K > verify_stochastic_threshold; if (verbosity >= 1) { std::cout << "Matrix size " << M << "x" << K << "x" << N << std::endl; @@ -140,17 +146,26 @@ int main(int argc, const char *argv[]) { std::vector BVec(B_VOLUME); for (int i = 0; i < B_VOLUME; i++) { BVec[i] = matmul_common::random_bfloat16_t(); + // Diagonal: + // if(i % N == i / N) { + // BVec[i] = 1.0; + // } else { + // BVec[i] = 0.0; + // } } memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE))); // Initialize outputs; bufOut is results matrix plus tracing info char *bufOut = bo_out.map(); std::vector CVec(C_VOLUME); - // memcpy(bufOut, CVec.data(), (CVec.size() * sizeof(C_DATATYPE))); memset(bufOut, 0, OUT_SIZE); - // if(trace_size > 0) { - // memset(bufOut + C_SIZE, 0, trace_size); - // } + + if (verbosity >= 2) { + std::cout << "A = \n"; + matmul_common::print_matrix(AVec, K); + std::cout << "B = \n"; + matmul_common::print_matrix(BVec, N); + } // Instruction buffer for DMA configuration void *bufInstr = bo_instr.map(); @@ -172,14 +187,14 @@ int main(int argc, const char *argv[]) { for (unsigned iter = 0; iter < num_iter; iter++) { if (verbosity >= 1) { - std::cout << "Running Kernel.\n"; + std::cout << "Running Kernel (iteration " << iter << ").\n"; } auto start = std::chrono::high_resolution_clock::now(); unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_out); ert_cmd_state r = run.wait(); if (r != ERT_CMD_STATE_COMPLETED) { - std::cout << "kernel did not complete. returned status: " << r << "\n"; + std::cout << "Kernel did not complete. 
Returned status: " << r << "\n"; return 1; } auto stop = std::chrono::high_resolution_clock::now(); @@ -193,16 +208,29 @@ int main(int argc, const char *argv[]) { memcpy(CVec.data(), bufOut, (CVec.size() * sizeof(C_DATATYPE))); if (do_verify) { if (verbosity >= 1) { - std::cout << "Verifying against reference matmul ..." << std::endl; + if (do_verify_stochastic) { + std::cout << "Verifying " << verify_stochastic_n_samples + << " random samples against reference matmul ..." + << std::endl; + } else { + std::cout << "Verifying against reference matmul ..." << std::endl; + } } auto vstart = std::chrono::system_clock::now(); - errors = matmul_common::verify(M, N, K, AVec, BVec, CVec); + if (do_verify_stochastic) { + errors = matmul_common::verify_stochastic( + M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity); + } else { + errors = matmul_common::verify( + M, N, K, AVec, BVec, CVec); + } auto vstop = std::chrono::system_clock::now(); float vtime = std::chrono::duration_cast(vstop - vstart) .count(); if (verbosity >= 1) { - std::cout << "Verify time: " << vtime << "secs." << std::endl; + std::cout << "Verify time: " << vtime << " s." 
<< std::endl; } } else { if (verbosity >= 1) @@ -241,7 +269,13 @@ int main(int argc, const char *argv[]) { std::cout << "\nPASS!\n\n"; return 0; } else { - std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nError count: " << errors; + if (do_verify_stochastic) { + std::cout << " (out of " << verify_stochastic_n_samples + << " random samples)"; + } + std::cout << "\n\n"; + std::cout << "\nFailed.\n\n"; return 1; } diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 76453f4b94..139d14fd12 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -34,13 +34,44 @@ def my_matmul(M=512, K=512, N=512): r = 4 s = 8 t = 4 - word_size_in = 2 - word_size_out = 2 n_rows = 4 n_cols = 4 n_cores = n_rows * n_cols + # Input matrix A: + # Conceptually, we divide input A into (m * n_rows, k)-sized blocks. These + # blocks are _broadcast_ across AIE core columns, then _distributed_ across + # rows, s.t. each of the n_rows compute cores in a column receives a + # contiguous (m, k)-sized block of A. + assert ( + M % (m * n_rows) == 0 + ), """A must be tileable into (m * n_rows, k)-sized blocks""" + + # Both A and B are tiled in the K dimension into size k. + assert K % k == 0 + + # Input matrix B: + # Conceptually, we do the same as with A, but instead of broadcasting + # across columns we broadcast across rows and distribute across columns. + assert ( + N % (n * n_cols) == 0 + ), """B must be tileable into (k, n * n_cols)-sized blocks""" + + # r, s, t are the dimensions required by the microkernel MAC instructions. 
+ assert m % r == 0 + assert k % s == 0 + assert n % t == 0 + + word_size_in = 2 + word_size_out = 2 + + # If you get errors during CDO generation due to running out of program + # memory, it may be because too much code is generated due to ObjectFIFO + # loop unrollings. Reducing the depth to 1 here will work around that at + # a big performance cost. + fifo_depth = 2 + A_sz_in_i32s = M * K * word_size_in // 4 B_sz_in_i32s = K * N * word_size_in // 4 C_sz_in_bytes = M * N * word_size_out @@ -187,14 +218,14 @@ def device_body(): inA_fifo_names[i], shims[i], mems[i], - 2, + fifo_depth, memRef_inA_ty, ) memA_fifos[memA_fifo_names[i]] = object_fifo( memA_fifo_names[i], mems[i], t_cores[i][0:n_cols], - 2, + fifo_depth, memRef_A_ty, [ (m // r, r * k), @@ -211,14 +242,14 @@ def device_body(): inB_fifo_names[i], shims[i], mems[i], - 2, + fifo_depth, memRef_inB_ty, ) memB_fifos[memB_fifo_names[i]] = object_fifo( memB_fifo_names[i], mems[i], cores[i][0:n_rows], - 2, + fifo_depth, memRef_B_ty, [ (k // s, s * n), @@ -236,14 +267,14 @@ def device_body(): memC_fifo_names[i][j], cores[i][j], mems[i], - 2, + fifo_depth, memRef_C_ty, ) outC_fifos[outC_fifo_names[i]] = object_fifo( outC_fifo_names[i], mems[i], shims[i], - 2, + fifo_depth, memRef_outC_ty, [ (m // r, r * n), @@ -261,7 +292,9 @@ def device_body(): @core(cores[j][i], "mm.o") def core_body(): for _ in for_(0xFFFFFFFF): - for _ in for_(tiles): + for _ in ( + for_(tiles) if tiles > 1 else range(1) + ): # Workaround for issue #1547 elem_out = memC_fifos[j][memC_fifo_names[j][i]].acquire( ObjectFifoPort.Produce, 1, @@ -290,7 +323,9 @@ def core_body(): ObjectFifoPort.Produce, 1 ) yield_([]) - yield_([]) + + if tiles > 1: # workaround for issue #1547 + yield_([]) # To/from AIE-array data movement From 08e404a3b11f1a5dab67950a1f256915181a5c1d Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 12:51:02 -0700 Subject: [PATCH 02/17] [matmul] allow modifiable tile size for whole_array tmp --- aie_kernels/aie2/mm.cc | 37 
++++++++++++++----- .../matrix_multiplication/makefile-common | 15 ++++---- .../whole_array/Makefile | 13 ++++++- .../matrix_multiplication/whole_array/aie2.py | 14 +++---- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/aie_kernels/aie2/mm.cc b/aie_kernels/aie2/mm.cc index 8b7732fdaf..35437afb40 100755 --- a/aie_kernels/aie2/mm.cc +++ b/aie_kernels/aie2/mm.cc @@ -24,16 +24,16 @@ #include "zero.cc" -template +template void matmul_scalar(T_in *a, T_in *b, T_out *c) { event0(); - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col++) { + for (int row = 0; row < rowA; row++) { + for (int col = 0; col < colB; col++) { T_out running_sum = 0; - for (int i = 0; i < K; i++) { - running_sum += a[row * K + i] * b[i * N + col]; + for (int i = 0; i < colA; i++) { + running_sum += a[row * colA + i] * b[i * colB + col]; } - c[row * N + col] += running_sum; + c[row * colB + col] += running_sum; } } event1(); @@ -397,6 +397,23 @@ void matmul_vectorized_4x8x4_bf16_f32(const bfloat16 *__restrict pA, extern "C" { +// If you want to compile microkernels with different inner tile sizes, +// define DIM_M, DIM_K and DIM_N at compile time using -DDIM_M=32 etc. +// These dimensions must be divisible by the r, s, t dimensions used in +// the kernels. 
+ +#ifndef DIM_M +#define DIM_M 64 +#endif + +#ifndef DIM_K +#define DIM_K 64 +#endif + +#ifndef DIM_N +#define DIM_N 64 +#endif + #define combos(X) \ X(int16, i16, int16, i16, 4, 4, 4) \ X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4) \ @@ -407,26 +424,26 @@ extern "C" { void matmul_##mlir_type_in##_##mlir_type_out(ctype_in *a_in, ctype_in *b_in, \ ctype_out *c_out) { \ matmul_vectorized_##r##x##s##x##t##_##mlir_type_in##_##mlir_type_out< \ - 64, 64, 64>(a_in, b_in, c_out); \ + DIM_M, DIM_K, DIM_N>(a_in, b_in, c_out); \ } #define matmul_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, \ r, s, t) \ void matmul_scalar_##mlir_type_in##_##mlir_type_out( \ ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) { \ - matmul_scalar(a_in, b_in, c_out); \ + matmul_scalar(a_in, b_in, c_out);\ } #define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, \ mlir_type_out, r, s, t) \ void zero_##mlir_type_out(ctype_out *c_out) { \ - zero_vectorized(c_out); \ + zero_vectorized(c_out); \ } #define zero_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, \ r, s, t) \ void zero_scalar_##mlir_type_out(ctype_out *c_out) { \ - zero_scalar(c_out); \ + zero_scalar(c_out); \ } combos(matmul_vectorized_c_func) combos(matmul_scalar_c_func) diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index e92f4e699c..bd1eb11409 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -28,10 +28,7 @@ # N=1 for matrix-vector srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) -#include ${CURDIR}/../../makefile-common current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -#include ${current_dir}../../makefile-common -SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST))) include ${current_dir}../../makefile-common # defaults; overwrite if needed @@ -39,13 +36,15 @@ M?=512 
K?=512 N?=512 -trace_size=65536 +trace_size?=65536 -mlir_target?=build/aie_${M}x${K}x${N}.mlir -xclbin_target?=build/final_${M}x${K}x${N}.xclbin -insts_target?=build/insts_${M}x${K}x${N}.txt +target_suffix?=${M}x${K}x${n} +mlir_target?=build/aie_${target_suffix}.mlir +xclbin_target?=build/final_${target_suffix}.xclbin +insts_target?=build/insts_${target_suffix}.txt runargs?=-v 1 --warmup 1 --iters 1 +aieargs+=-M $M -K $K -N $N kernels_dir=${srcdir}/../../../../aie_kernels/aie2 @@ -58,7 +57,7 @@ build/%.o: ${kernels_dir}/%.cc ${mlir_target}: ${srcdir}/aie2.py mkdir -p ${@D} - python3 $< -M $M -K $K -N $N > $@ + python3 $< ${aieargs} > $@ ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o} mkdir -p ${@D} diff --git a/programming_examples/basic/matrix_multiplication/whole_array/Makefile b/programming_examples/basic/matrix_multiplication/whole_array/Makefile index 2289d762c6..617c76e975 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/Makefile +++ b/programming_examples/basic/matrix_multiplication/whole_array/Makefile @@ -6,13 +6,22 @@ # ##===----------------------------------------------------------------------===## srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) - subdir=whole_array targetname=matrixMultiplication -kernels=mm M?=512 K?=512 N?=512 +m?=64 +k?=64 +n?=64 + +kernels=mm_${m}x${k}x${n} +aieargs+=-m $m -k $k -n $n +target_suffix=${M}x${K}x${N}_${m}x${k}x${n} include ${srcdir}/../makefile-common + +build/mm_${m}x${k}x${n}.o: ${kernels_dir}/mm.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -DDIM_M=${m} -DDIM_K=${k} -DDIM_N=${n} -c $< -o ${@F} diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 139d14fd12..022e3b975f 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ 
b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -23,14 +23,14 @@ def main(): argparser.add_argument("-M", type=int, default=512) argparser.add_argument("-K", type=int, default=512) argparser.add_argument("-N", type=int, default=512) + argparser.add_argument("-m", type=int, default=64) + argparser.add_argument("-k", type=int, default=64) + argparser.add_argument("-n", type=int, default=64) args = argparser.parse_args() - my_matmul(args.M, args.K, args.N) + my_matmul(args.M, args.K, args.N, args.m, args.k, args.n) -def my_matmul(M=512, K=512, N=512): - m = 64 - k = 64 - n = 64 +def my_matmul(M, K, N, m, k, n): r = 4 s = 8 t = 4 @@ -70,7 +70,7 @@ def my_matmul(M=512, K=512, N=512): # memory, it may be because too much code is generated due to ObjectFIFO # loop unrollings. Reducing the depth to 1 here will work around that at # a big performance cost. - fifo_depth = 2 + fifo_depth = 1 A_sz_in_i32s = M * K * word_size_in // 4 B_sz_in_i32s = K * N * word_size_in // 4 @@ -289,7 +289,7 @@ def device_body(): for j in range(n_cols): for i in range(n_rows): # Compute tile i - @core(cores[j][i], "mm.o") + @core(cores[j][i], f"mm_{m}x{k}x{n}.o") def core_body(): for _ in for_(0xFFFFFFFF): for _ in ( From 74c59e2a1e88f24128cb6d87e434880c6967e9d6 Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 12:51:26 -0700 Subject: [PATCH 03/17] [matmul] simplify verification --- .../basic/matrix_multiplication/common.h | 55 ++----------------- 1 file changed, 4 insertions(+), 51 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index 851eb8ded7..f045835698 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -116,68 +116,21 @@ static inline std::bfloat16_t random_bfloat16_t() { return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); } -template -void matmul_naive(int M, 
int N, int K, const std::vector A, - const std::vector B, std::vector &C) { - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col++) { - Tout running_sum = 0; - for (int k = 0; k < K; k++) { - running_sum += Tout(A[row * K + k] * B[k * N + col]); - } - C[row * N + col] = Tout(running_sum); - } - } -} - template void matmul(int M, int N, int K, const std::vector A, const std::vector B, std::vector &C) { - // A is an MxK matrix - // B is a KxN matrix - // C is the MxN output matrix, assumed to be zeroed out - - constexpr int K_block_size = 64; - const int n_K_blocks = K / K_block_size; - - const Tin *B_origin = B.data(); /* Avoid a calls to B.data() within the loop - with this const variable. B does not get - resized, so the pointer remains valid. */ - - const Tin *A_base = A.data(); /* Points to start of current row of A, - monotonically increasing by K. */ - const Tin *B_base = B_origin; /* Points to start of current column of B; - increases by 1 in each inner loop, resets - to B_origin (0) at the start of a new row - (outer loop). */ - - const Tin *A_ptr = A_base; - const Tin *B_ptr = B_base; - Tout *C_ptr = C.data(); /* Monotonically increasing by 1. 
*/ - for (int row = 0; row < M; row++) { for (int col = 0; col < N; col++) { - A_ptr = A_base; - B_ptr = B_base; Tacc running_sum = 0; - for (int k = 0; k < n_K_blocks; k++) { - for (int i = 0; i < K_block_size; i++) { - running_sum += Tacc(*A_ptr * *B_ptr); - A_ptr += 1; // Advance to right neighbor; next value in this row - B_ptr += N; // Advance to bottom neighbor; next value in this column - } + for (int k = 0; k < K; k++) { + running_sum += Tacc(A[row * K + k] * B[k * N + col]); } - *C_ptr = Tout(running_sum); - C_ptr += 1; - B_base += 1; /* Next iteration: same row of A (A_base unchanged), - next column of B (B_base increases by 1) */ + C[row * N + col] = Tout(running_sum); } - A_base += K; // Advance to next row of A - B_base = B_origin; /* Next row of A means we need to restart at the first - column of B. */ } } + template Tout mul_acc(int M, int N, int K, int row, int col, const std::vector A, const std::vector B) { From 5af0e9d6d1b2d082ded44ccaa21b347e51b90726 Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 13:45:26 -0700 Subject: [PATCH 04/17] [matmul] simplify whole_array strides to element size after PR #1538 --- .../matrix_multiplication/whole_array/aie2.py | 96 +++++-------------- 1 file changed, 25 insertions(+), 71 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 022e3b975f..88e5346cc2 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -63,45 +63,13 @@ def my_matmul(M, K, N, m, k, n): assert k % s == 0 assert n % t == 0 - word_size_in = 2 - word_size_out = 2 - # If you get errors during CDO generation due to running out of program # memory, it may be because too much code is generated due to ObjectFIFO # loop unrollings. Reducing the depth to 1 here will work around that at # a big performance cost. 
fifo_depth = 1 - A_sz_in_i32s = M * K * word_size_in // 4 - B_sz_in_i32s = K * N * word_size_in // 4 - C_sz_in_bytes = M * N * word_size_out - C_sz_in_i32s = C_sz_in_bytes // 4 - - M_div_m = M // m - M_div_m_div_n_rows = M // (m * n_rows) - K_div_k = K // k - N_div_n = N // n - tiles = M_div_m * N_div_n // n_cores - N_div_n_div_n_cols = N_div_n // n_cols - - # Matrix A: MxK, submatrices a: mxk - k_in_i32s = k * word_size_in // 4 - K_in_i32s = K * word_size_in // 4 - m_x_n_rows = m * n_rows - - # Matrix B: KxN, submatrices b: kxn - n_in_i32s = n * word_size_in // 4 - N_in_i32s = N * word_size_in // 4 - k_x_N_in_i32s = k * N * word_size_in // 4 - n_x_n_cols_in_i32s = n_in_i32s * n_cols - - # Output Matrix C: MxN - n_in_i32s_out = n * word_size_out // 4 - N_in_i32s_out = N * word_size_out // 4 - m_x_n_rows_x_N_in_i32s_out = m * n_rows * N_in_i32s_out - n_x_n_cols_in_i32s_out = n_in_i32s_out * n_cols - - vectorized = True + n_tiles = (M // m) * (N // n) // n_cores with mlir_mod_ctx() as ctx: @@ -293,7 +261,7 @@ def device_body(): def core_body(): for _ in for_(0xFFFFFFFF): for _ in ( - for_(tiles) if tiles > 1 else range(1) + for_(n_tiles) if n_tiles > 1 else range(1) ): # Workaround for issue #1547 elem_out = memC_fifos[j][memC_fifo_names[j][i]].acquire( ObjectFifoPort.Produce, @@ -301,7 +269,7 @@ def core_body(): ) call(zero, [elem_out]) - for _ in for_(K_div_k): + for _ in for_(K // k): elem_in_a = memA_fifos[memA_fifo_names[i]].acquire( ObjectFifoPort.Consume, 1, @@ -324,82 +292,68 @@ def core_body(): ) yield_([]) - if tiles > 1: # workaround for issue #1547 + if n_tiles > 1: # workaround for issue #1547 yield_([]) # To/from AIE-array data movement @FuncOp.from_py_func( - T.memref(A_sz_in_i32s, T.i32()), - T.memref(B_sz_in_i32s, T.i32()), - T.memref(C_sz_in_i32s, T.i32()), + T.memref(M * K, T.bf16()), + T.memref(K * N, T.bf16()), + T.memref(M *N , T.bf16()), ) def sequence(A, B, C): # only do 5 tile rows at a time before synchronizing, so we can reuse BDs 
rows_per_block = 5 for tile_row_block in range( - (M_div_m_div_n_rows + rows_per_block - 1) // rows_per_block + (M // m // n_rows + rows_per_block - 1) // rows_per_block ): num_tile_rows = min( [ rows_per_block, - M_div_m_div_n_rows - tile_row_block * rows_per_block, + M // m // n_rows - tile_row_block * rows_per_block, ] ) C_row_offset = ( - tile_row_block * rows_per_block * m * n_rows * N * word_size_out + tile_row_block * rows_per_block * m * n_rows * N ) for i in range(n_cols): - C_col_offset = i * n * word_size_out - C_offset_in_i32s = (C_col_offset + C_row_offset) // 4 + C_col_offset = i * n + C_offset = (C_col_offset + C_row_offset) * 2 npu_dma_memcpy_nd( metadata=outC_fifo_names[i], bd_id=0, mem=C, - offsets=[0, 0, 0, C_offset_in_i32s], - sizes=[ - num_tile_rows, - N_div_n_div_n_cols, - m_x_n_rows, - n_in_i32s_out, - ], - strides=[ - m_x_n_rows_x_N_in_i32s_out, - n_x_n_cols_in_i32s_out, - N_in_i32s_out, + offsets=[0, 0, 0, C_offset], + sizes=[num_tile_rows, N // n // n_cols, m * n_rows, n], + strides=[m * n_rows * N, n * n_cols, N ], ) for tile_row in range(num_tile_rows): - A_row_offset_in_i32s = ( + A_row_offset = ( ((tile_row_block * rows_per_block) + tile_row) * n_rows * m * K - * word_size_in - // 4 ) - A_col_offset_in_i32s = i * m * K * word_size_in // 4 - B_col_offset_in_i32s = i * n * word_size_in // 4 + A_col_offset = i * m * K + A_offset = A_row_offset + A_col_offset + B_col_offset = i * n npu_dma_memcpy_nd( metadata=inA_fifo_names[i], bd_id=2 * tile_row + 1, mem=A, - offsets=[ - 0, - 0, - 0, - A_col_offset_in_i32s + A_row_offset_in_i32s, - ], - sizes=[N_div_n_div_n_cols, K_div_k, m, k_in_i32s], - strides=[0, k_in_i32s, K_in_i32s], + offsets=[0, 0, 0, A_offset ], + sizes=[N // n // n_cols, K // k, m, k], + strides=[0, k, K], ) npu_dma_memcpy_nd( metadata=inB_fifo_names[i], bd_id=2 * tile_row + 2, mem=B, - offsets=[0, 0, 0, B_col_offset_in_i32s], - sizes=[N_div_n_div_n_cols, K_div_k, k, n_in_i32s], - strides=[n_x_n_cols_in_i32s, k_x_N_in_i32s, 
N_in_i32s], + offsets=[0, 0, 0, B_col_offset], + sizes=[N // n // n_cols, K // k, k, n], + strides=[n * n_cols, k * N, N], ) for i in range(n_cols): npu_sync(column=i, row=0, direction=0, channel=0) From cf43febf95f430be111c737f78cac5933003d6dc Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 16:17:08 -0700 Subject: [PATCH 05/17] [matmul] offsets seem to still be in bytes; fix --- .../basic/matrix_multiplication/single_core/aie2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index ba312aa417..cf298e2645 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -194,7 +194,7 @@ def sequence(A, B, C): for tile_row in range(num_tile_rows): A_row_offset = ( ((tile_row_block * rows_per_block) + tile_row) * m * K - ) + ) * 2 npu_dma_memcpy_nd( metadata="inA", bd_id=2 * tile_row + 1, From 2529d275f6db1420816d13c2ef08c9df913d3527 Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 16:26:42 -0700 Subject: [PATCH 06/17] [matmul] fix matrix printing error --- programming_examples/basic/matrix_multiplication/common.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index f045835698..d5997a616e 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -130,7 +130,6 @@ void matmul(int M, int N, int K, const std::vector A, } } - template Tout mul_acc(int M, int N, int K, int row, int col, const std::vector A, const std::vector B) { @@ -200,7 +199,8 @@ void print_matrix(const std::vector matrix, int n_cols, if (elide_cols) { \ ostream << std::setw(0) << elide_sym; \ } \ - for (int 
col = n_printable_cols / 2 + 1; col < n_printable_cols; col++) { \ + for (int i = 0; i < (n_printable_cols - 1) / 2; i++) { \ + int col = n_cols - (n_printable_cols - 1) / 2 + i; \ ostream << std::right << std::setw(w) << (what); \ ostream << std::setw(0) << col_sep; \ } @@ -213,7 +213,8 @@ void print_matrix(const std::vector matrix, int n_cols, print_row(elide_sym); ostream << std::endl; } - for (int row = n_printable_rows / 2 + 1; row < n_printable_rows; row++) { + for (int i = 0; i < (n_printable_rows - 1) / 2; i++) { + int row = n_rows - (n_printable_rows - 1) / 2 + i; print_row(matrix[row * n_cols + col]); ostream << std::endl; } From d76ce27ad6e5b4b2dfdd592dac894495d370ed68 Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 16:28:28 -0700 Subject: [PATCH 07/17] [matmul] allow single_core overall and tile size to be adjusted --- .../single_core/Makefile | 20 +++- .../matrix_multiplication/single_core/aie2.py | 105 ++++++++++++------ 2 files changed, 87 insertions(+), 38 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/single_core/Makefile b/programming_examples/basic/matrix_multiplication/single_core/Makefile index ca92224df8..92f48a545a 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/Makefile +++ b/programming_examples/basic/matrix_multiplication/single_core/Makefile @@ -10,12 +10,20 @@ subdir=single_core targetname=matrixMultiplication kernels=mm -# Currently does not accept reconfiguring size via these variables; must change -# in source at aie2.py as well as here -M=256 -K=256 -N=256 +M?=256 +K?=256 +N?=256 +m?=64 +k?=64 +n?=64 + +kernels=mm_${m}x${k}x${n} +aieargs+=-m $m -k $k -n $n +target_suffix=${M}x${K}x${N}_${m}x${k}x${n} -SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST))) include ${SELF_DIR}../makefile-common +build/mm_${m}x${k}x${n}.o: ${kernels_dir}/mm.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -DDIM_M=${m} -DDIM_K=${k} -DDIM_N=${n} -c $< -o 
${@F} + diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index cf298e2645..6b6a48e400 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -5,24 +5,45 @@ # # (c) Copyright 2023 AMD Inc. +import sys +import argparse + +from aie.extras.context import mlir_mod_ctx from aie.dialects.aie import * from aie.dialects.aiex import * from aie.dialects.scf import * -from aie.extras.context import mlir_mod_ctx import aie.utils.trace as trace_utils -def my_matmul(): - M = 256 - K = 256 - N = 256 - m = 64 - k = 64 - n = 64 +def main(): + argparser = argparse.ArgumentParser( + prog="AIE Matrix Multiplication MLIR Design (Whole Array)", + description="Emits MLIR code for a matrix multiplication design of the given input size", + ) + argparser.add_argument("-M", type=int, default=256) + argparser.add_argument("-K", type=int, default=256) + argparser.add_argument("-N", type=int, default=256) + argparser.add_argument("-m", type=int, default=64) + argparser.add_argument("-k", type=int, default=64) + argparser.add_argument("-n", type=int, default=64) + args = argparser.parse_args() + my_matmul(args.M, args.K, args.N, args.m, args.k, args.n) + + +def my_matmul(M, K, N, m, k, n): + + assert M % m == 0 + assert K % k == 0 + assert N % n == 0 + r = 4 s = 8 t = 4 + assert m % r == 0 + assert k % s == 0 + assert n % t == 0 + vectorized = True enable_tracing = False trace_size = 65536 @@ -81,12 +102,16 @@ def device_body(): compute_tile2, 2, memref_a_ty, - [ - (m // r, r * k), - (k // s, s), - (r, k), - (s, 1), - ], + ( + [ + (m // r, r * k), + (k // s, s), + (r, k), + (s, 1), + ] + if vectorized + else [] + ), ) object_fifo_link(inA, memA) @@ -98,12 +123,16 @@ def device_body(): compute_tile2, 2, memref_b_ty, - [ - (k // s, s * n), - (n // t, t), - (s, n), - (t, 1), - ], + ( + [ + (k 
// s, s * n), + (n // t, t), + (s, n), + (t, 1), + ] + if vectorized + else [] + ), ) object_fifo_link(inB, memB) @@ -115,12 +144,16 @@ def device_body(): shim_tile, 2, memref_c_ty, - [ - (m // r, r * n), - (r, t), - (n // t, r * t), - (t, 1), - ], + ( + [ + (m // r, r * n), + (r, t), + (n // t, r * t), + (t, 1), + ] + if vectorized + else [] + ), ) object_fifo_link(memC, outC) @@ -131,17 +164,19 @@ def device_body(): # Set up compute tiles # Compute tile 2 - @core(compute_tile2, "mm.o") + @core(compute_tile2, f"mm_{m}x{k}x{n}.o") def core_body(): for _ in for_(0xFFFFFFFF): - for _ in for_(tiles): + for _ in for_(tiles) if tiles > 1 else range(1): # issue #1547 elem_out = memC.acquire(ObjectFifoPort.Produce, 1) if vectorized: call(zero, [elem_out]) else: call(zero_scalar, [elem_out]) - for _ in for_(K_div_k): + for _ in ( + for_(K_div_k) if K_div_k > 1 else range(1) + ): # issue #1547 elem_in_a = memA.acquire(ObjectFifoPort.Consume, 1) elem_in_b = memB.acquire(ObjectFifoPort.Consume, 1) if vectorized: @@ -150,10 +185,12 @@ def core_body(): call(matmul_scalar, [elem_in_a, elem_in_b, elem_out]) memA.release(ObjectFifoPort.Consume, 1) memB.release(ObjectFifoPort.Consume, 1) - yield_([]) + if K_div_k > 1: + yield_([]) memC.release(ObjectFifoPort.Produce, 1) - yield_([]) + if tiles > 1: + yield_([]) yield_([]) # To/from AIE-array data movement @@ -216,4 +253,8 @@ def sequence(A, B, C): print(ctx.module) -my_matmul() +if __name__ == "__main__": + main() +else: + print("Not meant to be imported") + sys.exit(1) From 9f86e90b7d73cd8e74bc8342c134bf581eb1e402 Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 16:28:44 -0700 Subject: [PATCH 08/17] [matmul] fix typo in makefile-common --- .../basic/matrix_multiplication/makefile-common | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index bd1eb11409..1e095eda07 
100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -38,12 +38,12 @@ N?=512 trace_size?=65536 -target_suffix?=${M}x${K}x${n} +target_suffix?=${M}x${K}x${N} mlir_target?=build/aie_${target_suffix}.mlir xclbin_target?=build/final_${target_suffix}.xclbin insts_target?=build/insts_${target_suffix}.txt -runargs?=-v 1 --warmup 1 --iters 1 +runargs?=-v 2 --warmup 1 --iters 1 aieargs+=-M $M -K $K -N $N kernels_dir=${srcdir}/../../../../aie_kernels/aie2 From 20349d9a92a614f6d9fc97867bb6089368ab5f66 Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 16:29:38 -0700 Subject: [PATCH 09/17] [matmul] restore fifo depth to two for whole_array --- .../basic/matrix_multiplication/whole_array/aie2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 88e5346cc2..17c748ad0d 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -67,7 +67,7 @@ def my_matmul(M, K, N, m, k, n): # memory, it may be because too much code is generated due to ObjectFIFO # loop unrollings. Reducing the depth to 1 here will work around that at # a big performance cost. 
- fifo_depth = 1 + fifo_depth = 2 n_tiles = (M // m) * (N // n) // n_cores From 654413456b346edb709968d862d5ee58f2b41e2e Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 16:36:05 -0700 Subject: [PATCH 10/17] [matmul] express offets in bytes in whole_array design; reformat --- .../matrix_multiplication/whole_array/aie2.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 17c748ad0d..4adb1cd7e2 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -300,7 +300,7 @@ def core_body(): @FuncOp.from_py_func( T.memref(M * K, T.bf16()), T.memref(K * N, T.bf16()), - T.memref(M *N , T.bf16()), + T.memref(M * N, T.bf16()), ) def sequence(A, B, C): # only do 5 tile rows at a time before synchronizing, so we can reuse BDs @@ -314,9 +314,7 @@ def sequence(A, B, C): M // m // n_rows - tile_row_block * rows_per_block, ] ) - C_row_offset = ( - tile_row_block * rows_per_block * m * n_rows * N - ) + C_row_offset = tile_row_block * rows_per_block * m * n_rows * N for i in range(n_cols): C_col_offset = i * n C_offset = (C_col_offset + C_row_offset) * 2 @@ -326,8 +324,7 @@ def sequence(A, B, C): mem=C, offsets=[0, 0, 0, C_offset], sizes=[num_tile_rows, N // n // n_cols, m * n_rows, n], - strides=[m * n_rows * N, n * n_cols, N - ], + strides=[m * n_rows * N, n * n_cols, N], ) for tile_row in range(num_tile_rows): A_row_offset = ( @@ -337,13 +334,13 @@ def sequence(A, B, C): * K ) A_col_offset = i * m * K - A_offset = A_row_offset + A_col_offset - B_col_offset = i * n + A_offset = (A_row_offset + A_col_offset) * 2 + B_col_offset = i * n * 2 npu_dma_memcpy_nd( metadata=inA_fifo_names[i], bd_id=2 * tile_row + 1, mem=A, - offsets=[0, 0, 0, A_offset ], + offsets=[0, 0, 0, A_offset], sizes=[N // n // 
n_cols, K // k, m, k], strides=[0, k, K], ) From c6974592f7536c3f012de814d72fa1089ecc1c5c Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 16:44:13 -0700 Subject: [PATCH 11/17] [matmul] reduce verification tolerance to 5% relative, 0.5 absolute --- programming_examples/basic/matrix_multiplication/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index d5997a616e..1396a39d45 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -236,7 +236,7 @@ template std::optional> verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual) { const float absTol = 0.5; - const float relTol = 0.15; + const float relTol = 0.05; if (!nearly_equal(expected, actual, relTol, absTol)) { return (struct error){row, col, expected, actual}; } From 07c029cf76309f6797d63e2b6c8974a1a66d5fd1 Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 12 Jun 2024 16:59:19 -0700 Subject: [PATCH 12/17] [matmul] format --- aie_kernels/aie2/mm.cc | 3 ++- programming_examples/basic/matrix_multiplication/test.cpp | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) mode change 100755 => 100644 aie_kernels/aie2/mm.cc diff --git a/aie_kernels/aie2/mm.cc b/aie_kernels/aie2/mm.cc old mode 100755 new mode 100644 index 35437afb40..0444fa6018 --- a/aie_kernels/aie2/mm.cc +++ b/aie_kernels/aie2/mm.cc @@ -431,7 +431,8 @@ extern "C" { r, s, t) \ void matmul_scalar_##mlir_type_in##_##mlir_type_out( \ ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) { \ - matmul_scalar(a_in, b_in, c_out);\ + matmul_scalar(a_in, b_in, \ + c_out); \ } #define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, \ diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp index e3786b0d2a..c838f30aeb 100644 --- 
a/programming_examples/basic/matrix_multiplication/test.cpp +++ b/programming_examples/basic/matrix_multiplication/test.cpp @@ -140,6 +140,7 @@ int main(int argc, const char *argv[]) { std::vector AVec(A_VOLUME); for (int i = 0; i < A_VOLUME; i++) { AVec[i] = matmul_common::random_bfloat16_t(); + // AVec[i] = i; } memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE))); B_DATATYPE *bufB = bo_b.map(); From 1dc7a6adddb3fc8a09a1c2e0ca88d377d5afd003 Mon Sep 17 00:00:00 2001 From: andrej Date: Thu, 13 Jun 2024 08:27:50 -0700 Subject: [PATCH 13/17] [matmul] fix CI test errors --- .../basic/matrix_multiplication/matrix_vector/test.cpp | 1 + .../basic/matrix_multiplication/single_core/Makefile | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp index 000d47499c..862256fa15 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp @@ -14,5 +14,6 @@ using A_DATATYPE = std::bfloat16_t; using B_DATATYPE = std::bfloat16_t; using C_DATATYPE = float; +using ACC_DATATYPE = float; #include "../test.cpp" diff --git a/programming_examples/basic/matrix_multiplication/single_core/Makefile b/programming_examples/basic/matrix_multiplication/single_core/Makefile index 92f48a545a..1142d7357e 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/Makefile +++ b/programming_examples/basic/matrix_multiplication/single_core/Makefile @@ -6,6 +6,7 @@ # ##===----------------------------------------------------------------------===## +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) subdir=single_core targetname=matrixMultiplication kernels=mm @@ -21,7 +22,7 @@ kernels=mm_${m}x${k}x${n} aieargs+=-m $m -k $k -n $n target_suffix=${M}x${K}x${N}_${m}x${k}x${n} -include 
${SELF_DIR}../makefile-common +include ${srcdir}/../makefile-common build/mm_${m}x${k}x${n}.o: ${kernels_dir}/mm.cc mkdir -p ${@D} From 762d866b2bc1cb7a68e8402179785e41b3ad255b Mon Sep 17 00:00:00 2001 From: andrej Date: Fri, 14 Jun 2024 14:37:58 -0700 Subject: [PATCH 14/17] [matmul] fix matrix printing rounding error --- .../basic/matrix_multiplication/common.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index 1396a39d45..b1a7e92347 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -171,7 +171,7 @@ void print_matrix(const std::vector matrix, int n_cols, assert(matrix.size() % n_cols == 0); auto maxima = std::minmax_element(matrix.begin(), matrix.end()); - T max_val = std::max(*maxima.first, std::abs(*maxima.second)); + T max_val = std::max(*maxima.first, (T)std::abs(*maxima.second)); size_t n_digits = log10(max_val); if (w == -1) { w = n_digits; @@ -199,8 +199,8 @@ void print_matrix(const std::vector matrix, int n_cols, if (elide_cols) { \ ostream << std::setw(0) << elide_sym; \ } \ - for (int i = 0; i < (n_printable_cols - 1) / 2; i++) { \ - int col = n_cols - (n_printable_cols - 1) / 2 + i; \ + for (int i = 0; i < n_printable_cols / 2; i++) { \ + int col = n_cols - n_printable_cols / 2 + i; \ ostream << std::right << std::setw(w) << (what); \ ostream << std::setw(0) << col_sep; \ } @@ -213,8 +213,8 @@ void print_matrix(const std::vector matrix, int n_cols, print_row(elide_sym); ostream << std::endl; } - for (int i = 0; i < (n_printable_rows - 1) / 2; i++) { - int row = n_rows - (n_printable_rows - 1) / 2 + i; + for (int i = 0; i < n_printable_rows / 2; i++) { + int row = n_rows - n_printable_rows / 2 + i; print_row(matrix[row * n_cols + col]); ostream << std::endl; } From cfe3bbf6cc71d55f065dc8d961f9fe75cfe80f8f Mon Sep 17 
00:00:00 2001 From: andrej Date: Fri, 14 Jun 2024 14:49:30 -0700 Subject: [PATCH 15/17] [matvec] use integers to avoid float errors; swap in scalar kernel for now to pass verification --- aie_kernels/aie2/mv.cc | 36 +++++++---- .../matrix_vector/Makefile | 9 ++- .../matrix_vector/aie2.py | 60 ++++++++++++------- .../matrix_vector/test.cpp | 9 +-- 4 files changed, 78 insertions(+), 36 deletions(-) diff --git a/aie_kernels/aie2/mv.cc b/aie_kernels/aie2/mv.cc index 7ac7903c53..42829ffd6b 100644 --- a/aie_kernels/aie2/mv.cc +++ b/aie_kernels/aie2/mv.cc @@ -45,7 +45,8 @@ void matvec_vectorized(T_in *__restrict a, T_in *__restrict b, static_assert(s == 8); // s is fixed to 8 because that is the number of // column vectors (a_vec_0_0..a_vec_3_1) we create static_assert(k % s == 0); - static_assert(std::is_same::value); + static_assert(std::is_same::value || + std::is_same::value); // This kernel expects a "32-bit word transposed matrix", i.e. the result // of transposing the row-major representation of the matrix at a @@ -80,12 +81,12 @@ void matvec_vectorized(T_in *__restrict a, T_in *__restrict b, aie::accum c_acc_in; c_acc_in.from_vector(aie::load_v(c_ptr)); - const aie::vector a_vec_0 = aie::load_v<2 * r>(a_ptr); - const aie::vector a_vec_1 = + const aie::vector a_vec_0 = aie::load_v<2 * r>(a_ptr); + const aie::vector a_vec_1 = aie::load_v<2 * r>(a_ptr + 2 * m); - const aie::vector a_vec_2 = + const aie::vector a_vec_2 = aie::load_v<2 * r>(a_ptr + 4 * m); - const aie::vector a_vec_3 = + const aie::vector a_vec_3 = aie::load_v<2 * r>(a_ptr + 6 * m); // The even/odd calls below extract the interleaved columns of A. @@ -133,35 +134,48 @@ void matvec_vectorized(T_in *__restrict a, T_in *__restrict b, extern "C" { +// If you want to compile microkernels with different inner tile sizes, +// define DIM_M and DIM_K at compile time using -DDIM_M 16 etc. +// These dimensions must be divisible by the r, s dimensions used in +// the kernels. 
+ +#ifndef DIM_M +#define DIM_M 32 +#endif + +#ifndef DIM_K +#define DIM_K 32 +#endif + #define combos(X) \ - X(bfloat16, bf16, float, f32, accfloat) \ -// X(int16, i16, int16, i16, acc32) \ + /* X(bfloat16, bf16, float, f32, accfloat) */ \ + X(int16, i16, int32, i32, acc32) #define matvec_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, \ ctype_acc) \ void matvec_scalar_##mlir_type_in##_##mlir_type_out( \ ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) { \ - matvec_scalar(a_in, b_in, c_out); \ + matvec_scalar(a_in, b_in, c_out); \ } #define matvec_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, \ mlir_type_out, ctype_acc) \ void matvec_vectorized_##mlir_type_in##_##mlir_type_out( \ ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) { \ - matvec_vectorized( \ + matvec_vectorized( \ a_in, b_in, c_out); \ } #define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, \ mlir_type_out, ctype_acc) \ void zero_vectorized_##mlir_type_out(ctype_out *c_out) { \ - zero_vectorized(c_out); \ + zero_vectorized(c_out); \ } #define zero_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, \ ctype_acc) \ void zero_scalar_##mlir_type_out(ctype_out *c_out) { \ - zero_scalar(c_out); \ + zero_scalar(c_out); \ } combos(matvec_scalar_c_func) combos(matvec_vectorized_c_func) diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile b/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile index c701ce9a50..c86e4761af 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile @@ -8,14 +8,21 @@ subdir=matrix_vector targetname=matrixVectorMultiplication -kernels=mv # Currently does not accept reconfiguring size via these variables; must change # in source at aie2.py as well as here M=288 K=288 N=1 +m=32 +k=32 + +kernels=mv_${m}x${k} SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST))) include 
${SELF_DIR}../makefile-common +build/mv_${m}x${k}.o: ${kernels_dir}/mv.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -DDIM_M=${m} -DDIM_K=${k} -c $< -o ${@F} + diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py index 54276121c8..0657d2fb10 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py @@ -32,26 +32,36 @@ def my_matmul(): m_x_k = m * k m_x_K = m * K - vectorized = True + # FIXME vectorized kernel is currently erroneous + vectorized = False + + dtype_in = T.i16 + dtype_in_str = "i16" + dtype_out = T.i32 + dtype_out_str = "i32" with mlir_mod_ctx() as ctx: @device(AIEDevice.npu1_4col) def device_body(): - memRef_inA_ty = T.memref(m * k, T.bf16()) - memRef_inB_ty = T.memref(k, T.bf16()) - memRef_outC_ty = T.memref(m, T.f32()) - memRef_A_ty = T.memref(m, k, T.bf16()) + memRef_inA_ty = T.memref(m * k, dtype_in()) + memRef_inB_ty = T.memref(k, dtype_in()) + memRef_outC_ty = T.memref(m, dtype_out()) + memRef_A_ty = T.memref(m, k, dtype_in()) # AIE Core Function declarations - zero_scalar = external_func("zero_scalar_f32", inputs=[memRef_outC_ty]) - zero = external_func("zero_vectorized_f32", inputs=[memRef_outC_ty]) + zero_scalar = external_func( + f"zero_scalar_{dtype_out_str}", inputs=[memRef_outC_ty] + ) + zero = external_func( + f"zero_vectorized_{dtype_out_str}", inputs=[memRef_outC_ty] + ) matvec_scalar = external_func( - "matvec_scalar_bf16_f32", + f"matvec_scalar_{dtype_in_str}_{dtype_out_str}", inputs=[memRef_A_ty, memRef_inB_ty, memRef_outC_ty], ) matvec = external_func( - "matvec_vectorized_bf16_f32", + f"matvec_vectorized_{dtype_in_str}_{dtype_out_str}", inputs=[memRef_A_ty, memRef_inB_ty, memRef_outC_ty], ) @@ -96,11 +106,15 @@ def device_body(): cores[i], 2, memRef_A_ty, - [ - (k // 2 // 2, 2), - (m, 
k), - (2, 1), - ], # transpose at 4-byte (2xbf16) granularity + ( + [ + (k // 2 // 2, 2), + (m, k), + (2, 1), + ] + if vectorized + else [] + ), # transpose at 4-byte (2xbf16) granularity ) object_fifo_link( memA_fifos[memA_fifo_names[i]], inA_fifos[inA_fifo_names[i]] @@ -128,14 +142,17 @@ def device_body(): # Set up compute tiles for i in range(n_cores): # Compute tile i - @core(cores[i], "mv.o") + @core(cores[i], f"mv_{m}x{k}.o") def core_body(): for _ in for_(0xFFFFFFFF): elem_out = outC_fifos[outC_fifo_names[i]].acquire( ObjectFifoPort.Produce, 1, ) - call(zero, [elem_out]) + if vectorized or True: + call(zero, [elem_out]) + else: + call(zero_scalar, [elem_out]) for _ in for_(K_div_k): elem_in_a = inA_fifos[inA_fifo_names[i]].acquire( @@ -146,7 +163,10 @@ def core_body(): ObjectFifoPort.Consume, 1, ) - call(matvec, [elem_in_a, elem_in_b, elem_out]) + if vectorized: + call(matvec, [elem_in_a, elem_in_b, elem_out]) + else: + call(matvec_scalar, [elem_in_a, elem_in_b, elem_out]) inA_fifos[inA_fifo_names[i]].release( ObjectFifoPort.Consume, 1, @@ -166,9 +186,9 @@ def core_body(): # To/from AIE-array data movement @FuncOp.from_py_func( - T.memref(A_sz, T.bf16()), - T.memref(B_sz, T.bf16()), - T.memref(C_sz, T.f32()), + T.memref(A_sz, dtype_in()), + T.memref(B_sz, dtype_in()), + T.memref(C_sz, dtype_out()), ) def sequence(A, B, C): npu_dma_memcpy_nd( diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp index 862256fa15..eb41adafa3 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp @@ -9,11 +9,12 @@ //===----------------------------------------------------------------------===// #include +#include #define DATATYPES_USING_DEFINED -using A_DATATYPE = std::bfloat16_t; -using B_DATATYPE = std::bfloat16_t; -using C_DATATYPE = float; -using ACC_DATATYPE = 
float; +using A_DATATYPE = int16_t; // std::bfloat16_t; +using B_DATATYPE = int16_t; // std::bfloat16_t; +using C_DATATYPE = int32_t; // float; +using ACC_DATATYPE = int32_t; #include "../test.cpp" From df84e5bae4cb0a03b1ac542bd03b305620e1120b Mon Sep 17 00:00:00 2001 From: andrej Date: Sun, 23 Jun 2024 13:01:30 -0700 Subject: [PATCH 16/17] [matmul] add missing includes to common.h to make it work standalone --- programming_examples/basic/matrix_multiplication/common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index b1a7e92347..67518b841b 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -19,6 +19,9 @@ #include #include #include +#include +#include +#include namespace matmul_common { From 828b13da810867fecb00ed40a670cffaa13f9392 Mon Sep 17 00:00:00 2001 From: andrej Date: Sun, 23 Jun 2024 21:06:56 -0700 Subject: [PATCH 17/17] format --- aie_kernels/aie2/mv.cc | 8 ++++---- programming_examples/basic/matrix_multiplication/common.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/aie_kernels/aie2/mv.cc b/aie_kernels/aie2/mv.cc index 42829ffd6b..42c9fc4b2d 100644 --- a/aie_kernels/aie2/mv.cc +++ b/aie_kernels/aie2/mv.cc @@ -81,12 +81,12 @@ void matvec_vectorized(T_in *__restrict a, T_in *__restrict b, aie::accum c_acc_in; c_acc_in.from_vector(aie::load_v(c_ptr)); - const aie::vector a_vec_0 = aie::load_v<2 * r>(a_ptr); - const aie::vector a_vec_1 = + const aie::vector a_vec_0 = aie::load_v<2 * r>(a_ptr); + const aie::vector a_vec_1 = aie::load_v<2 * r>(a_ptr + 2 * m); - const aie::vector a_vec_2 = + const aie::vector a_vec_2 = aie::load_v<2 * r>(a_ptr + 4 * m); - const aie::vector a_vec_3 = + const aie::vector a_vec_3 = aie::load_v<2 * r>(a_ptr + 6 * m); // The even/odd calls below extract the interleaved columns of A. 
diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index 67518b841b..b2c6c14b53 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -15,13 +15,13 @@ #define MATRIX_MULTIPLICATION_H #include +#include #include #include +#include #include #include -#include #include -#include namespace matmul_common {