Skip to content

Commit

Permalink
Fix ukernel implemented in mm.cc (#1499)
Browse files Browse the repository at this point in the history
  • Loading branch information
newling authored May 17, 2024
1 parent 9bc55e0 commit 7fb9fad
Showing 1 changed file with 29 additions and 4 deletions.
33 changes: 29 additions & 4 deletions aie_kernels/mm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,27 @@

#include "zero.cc"

// Suppose A is a 64x64 tensor and B is a 64x64 tensor, and r=4, s=8, t=4.
//
// Let A[i,j] be the element at row i and column j of A, and
// B[i,j] be the element at row i and column j of B.
//
// The expectations of this function on the pointers pA, pB, and pC are:
//
// 1) all elements of A are contiguous in memory, starting from pA + offsetA
// 2) all elements of B are contiguous in memory, starting from pB + offsetB
// 3) all elements of C are contiguous in memory, starting from pC + offsetC
// 4) element A[i,j] is at pA[offsetA + i*8 + (64*8)*(j/8) + j%8]
// 5) element B[i,j] is at pB[offsetB + i*4 + (64*4)*(j/4) + j%4]
//
// 4) and 5) describe vertical stripes of A and B that are stored contiguously,
// with a row-major order within each stripe, i.e. the elements starting at
// pA + offsetA are:
//
// [A[0,0], ..., A[0,7], A[1,0], ..., A[1,7], A[2,0], ..., A[2,7], ... A[63,0],
// ..., A[63,7], A[0,8], ..., A[0,15], ..., A[63,63]]
//

template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
unsigned colB, unsigned r, unsigned s, unsigned t>
void matmul_vectorized(const T_in *__restrict pA, unsigned offsetA,
Expand All @@ -47,10 +68,14 @@ void matmul_vectorized(const T_in *__restrict pA, unsigned offsetA,
const T_in *__restrict pA3 = pA + offsetA + ((z + 2)) * MMUL::size_A;
const T_in *__restrict pA4 = pA + offsetA + ((z + 3)) * MMUL::size_A;

const T_in *__restrict pB1 = pB + offsetB + (j)*MMUL::size_B;
const T_in *__restrict pB2 = pB + offsetB + ((j + 1)) * MMUL::size_B;
const T_in *__restrict pB3 = pB + offsetB + ((j + 2)) * MMUL::size_B;
const T_in *__restrict pB4 = pB + offsetB + ((j + 3)) * MMUL::size_B;
const T_in *__restrict pB1 =
pB + offsetB + ((j + 0)) * colA * MMUL::size_B;
const T_in *__restrict pB2 =
pB + offsetB + ((j + 1)) * colA * MMUL::size_B;
const T_in *__restrict pB3 =
pB + offsetB + ((j + 2)) * colA * MMUL::size_B;
const T_in *__restrict pB4 =
pB + offsetB + ((j + 3)) * colA * MMUL::size_B;

aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
pA1 += rowA * MMUL::size_A;
Expand Down

0 comments on commit 7fb9fad

Please sign in to comment.