Skip to content

Commit

Permalink
Changes to NpuDmaMemcpyNdOp and AIEDmaToNpu to support sub-word strides, offsets and sizes (#1538)
Browse files Browse the repository at this point in the history

Co-authored-by: Joseph Melber <jgmelber@gmail.com>
  • Loading branch information
pvasireddy-amd and jgmelber authored Jun 11, 2024
1 parent 78e8a43 commit 0504f7a
Show file tree
Hide file tree
Showing 26 changed files with 254 additions and 221 deletions.
4 changes: 4 additions & 0 deletions include/aie-c/TargetModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ DEFINE_C_API_STRUCT(AieTargetModel, uint64_t);

MLIR_CAPI_EXPORTED AieTargetModel aieGetTargetModel(uint32_t device);

/// Returns the address generation granularity (in bits) for the target model.
MLIR_CAPI_EXPORTED uint32_t
aieGetTargetModelAddressGenGranularity(AieTargetModel targetModel);

/// Returns the number of columns in the target model.
MLIR_CAPI_EXPORTED int aieTargetModelColumns(AieTargetModel targetModel);

Expand Down
9 changes: 9 additions & 0 deletions include/aie/Dialect/AIE/IR/AIETargetModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ class AIETargetModel {
/// Return the target architecture.
virtual AIEArch getTargetArch() const = 0;

/// Return the address generation granularity (in bits) of the device, i.e.
/// the smallest unit in which DMA offsets, sizes and strides are expressed.
virtual uint32_t getAddressGenGranularity() const = 0;

/// Return the number of columns in the device.
virtual int columns() const = 0;

Expand Down Expand Up @@ -293,6 +296,8 @@ class AIE2TargetModel : public AIETargetModel {

AIEArch getTargetArch() const override;

uint32_t getAddressGenGranularity() const override { return 32; }

std::optional<TileID> getMemWest(TileID src) const override;
std::optional<TileID> getMemEast(TileID src) const override;
std::optional<TileID> getMemNorth(TileID src) const override;
Expand Down Expand Up @@ -352,6 +357,8 @@ class VC1902TargetModel : public AIE1TargetModel {
public:
VC1902TargetModel() = default;

uint32_t getAddressGenGranularity() const override { return 32; }

int columns() const override { return 50; }

int rows() const override { return 9; /* One Shim row and 8 Core rows. */ }
Expand Down Expand Up @@ -532,6 +539,8 @@ class VirtualizedNPUTargetModel : public BaseNPUTargetModel {
public:
VirtualizedNPUTargetModel(int _cols) : cols(_cols) {}

uint32_t getAddressGenGranularity() const override { return 32; }

int columns() const override { return cols; }

bool isShimNOCTile(int col, int row) const override { return row == 0; }
Expand Down
4 changes: 4 additions & 0 deletions lib/CAPI/TargetModel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ AieTargetModel aieGetTargetModel(uint32_t device) {
xilinx::AIE::getTargetModel(static_cast<xilinx::AIE::AIEDevice>(device)));
}

/// C API shim: unwraps the opaque handle and forwards to
/// AIETargetModel::getAddressGenGranularity(). Returns the granularity in
/// bits (e.g. 32 on AIE2-based devices).
uint32_t aieGetTargetModelAddressGenGranularity(AieTargetModel targetModel) {
  const auto &model = unwrap(targetModel);
  return model.getAddressGenGranularity();
}

/// C API shim: unwraps the opaque handle and forwards to
/// AIETargetModel::columns(), the number of columns in the device array.
int aieTargetModelColumns(AieTargetModel targetModel) {
  const auto &model = unwrap(targetModel);
  return model.columns();
}
Expand Down
12 changes: 10 additions & 2 deletions lib/Dialect/AIEX/IR/AIEXDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,16 @@ LogicalResult AIEX::BroadcastPacketOp::verify() {

LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
MemRefType buffer = getMemref().getType();
if (buffer.getElementTypeBitWidth() != 32)
return emitOpError("must be used with memref type with element width 32.");
const auto &targetModel = AIE::getTargetModel(*this);
auto addressGranularity = targetModel.getAddressGenGranularity();
if (buffer.getElementTypeBitWidth() > addressGranularity) {
return emitOpError("Maximum element bit width allowed is ")
<< addressGranularity << "bits. ";
} else if ((buffer.getNumElements() * buffer.getElementTypeBitWidth()) <
addressGranularity) {
return emitOpError("Minimum data transfer size required is ")
<< addressGranularity << "bits. ";
}
if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) {
return getConstantIntValue(s).has_value();
}))
Expand Down
16 changes: 16 additions & 0 deletions lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,22 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
llvm::reverse(op.getMixedOffsets()),
[](OpFoldResult s) { return getConstantIntValue(s).value(); });

MemRefType buffer = op.getMemref().getType();
const auto &targetModel = AIE::getTargetModel(op);
auto elemWidth = buffer.getElementTypeBitWidth();
auto addressGranularity = targetModel.getAddressGenGranularity();
if (elemWidth < addressGranularity) {
if (!strides.empty()) {
for (int i = 0; i < 3; i++) {
strides[i] = (strides[i] * elemWidth) / addressGranularity;
}
}
if (!sizes.empty())
sizes[0] = (sizes[0] * elemWidth) / addressGranularity;
if (!offsets.empty())
offsets[0] = (offsets[0] * elemWidth) / addressGranularity;
}

// column
column = IntegerAttr::get(i32ty, col);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,20 @@ def my_matmul():
K = 288
m = 32
k = 32
word_size_in = 2
word_size_out = 4

n_cores = 1

A_sz_in_i32s = M * K * word_size_in // 4
B_sz_in_i32s = K * word_size_in // 4
C_sz_in_bytes = M * word_size_out
C_sz_in_i32s = C_sz_in_bytes // 4
C_sz_div_n_cores_in_i32s = C_sz_in_i32s // n_cores
A_sz = M * K
B_sz = K
C_sz = M
C_sz_div_n_cores = C_sz // n_cores

M_div_m = M // m
M_div_m_div_n_cores = M // (m * n_cores)
K_div_k = K // k

K_in_i32s = K * word_size_in // 4
k_in_i32s = k * word_size_in // 4
m_in_i32s = m * word_size_in // 4
m_x_k_in_i32s = m * k * word_size_in // 4
m_x_K_in_i32s = m * K * word_size_in // 4
m_x_k = m * k
m_x_K = m * K

vectorized = True

Expand Down Expand Up @@ -172,35 +166,35 @@ def core_body():
# To/from AIE-array data movement

@FuncOp.from_py_func(
T.memref(A_sz_in_i32s, T.i32()),
T.memref(B_sz_in_i32s, T.i32()),
T.memref(C_sz_in_i32s, T.i32()),
T.memref(A_sz, T.bf16()),
T.memref(B_sz, T.bf16()),
T.memref(C_sz, T.f32()),
)
def sequence(A, B, C):
npu_dma_memcpy_nd(
metadata=inB_fifo_names[0],
bd_id=2,
mem=B,
sizes=[M_div_m_div_n_cores, 1, 1, K_in_i32s],
sizes=[M_div_m_div_n_cores, 1, 1, K],
strides=[0, 0, 0],
)
for i in range(n_cores):
A_offset = i * M_div_m_div_n_cores * m * K * word_size_in // 4
C_offset = i * M_div_m_div_n_cores * m * word_size_out // 4
A_offset = i * M_div_m_div_n_cores * m * K
C_offset = i * M_div_m_div_n_cores * m
npu_dma_memcpy_nd(
metadata=memA_fifo_names[i],
bd_id=1,
mem=A,
offsets=[0, 0, 0, A_offset],
sizes=[M_div_m_div_n_cores, K_div_k, m, k_in_i32s],
strides=[m_x_K_in_i32s, k_in_i32s, K_in_i32s],
sizes=[M_div_m_div_n_cores, K_div_k, m, k],
strides=[m_x_K, k, K],
)
npu_dma_memcpy_nd(
metadata=outC_fifo_names[i],
bd_id=0,
mem=C,
offsets=[0, 0, 0, C_offset],
sizes=[1, 1, 1, C_sz_div_n_cores_in_i32s],
sizes=[1, 1, 1, C_sz_div_n_cores],
strides=[0, 0, 0],
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,36 +22,26 @@ def my_matmul():
r = 4
s = 8
t = 4
word_size_in = 2
word_size_out = 2

vectorized = True
enable_tracing = False
trace_size = 65536

A_sz_in_i32s = M * K * word_size_in // 4
B_sz_in_i32s = K * N * word_size_in // 4
C_sz_in_bytes = M * N * word_size_out
C_sz_in_i32s = C_sz_in_bytes // 4
A_sz = M * K
B_sz = K * N
C_sz = M * N
C_sz_in_bytes = C_sz * 2

M_div_m = M // m
K_div_k = K // k
N_div_n = N // n
tiles = M_div_m * N_div_n

# Matrix A: MxK, submatrices a: mxk
k_in_i32s = k * word_size_in // 4
K_in_i32s = K * word_size_in // 4

# Matrix B: KxN, submatrices b: kxn
n_in_i32s = n * word_size_in // 4
N_in_i32s = N * word_size_in // 4
k_x_N_in_i32s = k * N * word_size_in // 4
k_x_N = k * N

# Output Matrix C: MxN
n_in_i32s_out = n * word_size_out // 4
N_in_i32s_out = N * word_size_out // 4
m_x_N_in_i32s_out = m * N * word_size_out // 4
m_x_N = m * N

with mlir_mod_ctx() as ctx:

Expand Down Expand Up @@ -169,9 +159,9 @@ def core_body():
# To/from AIE-array data movement

@FuncOp.from_py_func(
T.memref(A_sz_in_i32s, T.i32()),
T.memref(B_sz_in_i32s, T.i32()),
T.memref(C_sz_in_i32s, T.i32()),
T.memref(A_sz, T.bf16()),
T.memref(B_sz, T.bf16()),
T.memref(C_sz, T.bf16()),
)
def sequence(A, B, C):

Expand All @@ -189,42 +179,36 @@ def sequence(A, B, C):
for tile_row_block in range(
(M_div_m + rows_per_block - 1) // rows_per_block
):
C_row_offset_in_i32s = (
tile_row_block * rows_per_block * m * N * word_size_out // 4
)
C_row_offset = tile_row_block * rows_per_block * m * N
num_tile_rows = min(
[rows_per_block, M_div_m - tile_row_block * rows_per_block]
)
npu_dma_memcpy_nd(
metadata="outC",
bd_id=0,
mem=C,
offsets=[0, 0, 0, C_row_offset_in_i32s],
sizes=[num_tile_rows, N_div_n, m, n_in_i32s_out],
strides=[m_x_N_in_i32s_out, n_in_i32s_out, N_in_i32s_out],
offsets=[0, 0, 0, C_row_offset],
sizes=[num_tile_rows, N_div_n, m, n],
strides=[m_x_N, n, N],
)
for tile_row in range(num_tile_rows):
A_row_offset_in_i32s = (
((tile_row_block * rows_per_block) + tile_row)
* m
* K
* word_size_in
// 4
A_row_offset = (
((tile_row_block * rows_per_block) + tile_row) * m * K
)
npu_dma_memcpy_nd(
metadata="inA",
bd_id=2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_row_offset_in_i32s],
sizes=[N_div_n, K_div_k, m, k_in_i32s],
strides=[0, k_in_i32s, K_in_i32s],
offsets=[0, 0, 0, A_row_offset],
sizes=[N_div_n, K_div_k, m, k],
strides=[0, k, K],
)
npu_dma_memcpy_nd(
metadata="inB",
bd_id=2 * tile_row + 2,
mem=B,
sizes=[N_div_n, K_div_k, k, n_in_i32s],
strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s],
sizes=[N_div_n, K_div_k, k, n],
strides=[n, k_x_N, N],
)

npu_sync(column=0, row=0, direction=0, channel=0)
Expand Down
11 changes: 4 additions & 7 deletions programming_examples/basic/passthrough_kernel/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
def passthroughKernel(vector_size, trace_size):
N = vector_size
lineWidthInBytes = N // 4 # chop input in 4 sub-tensors
lineWidthInInt32s = lineWidthInBytes // 4

@device(AIEDevice.npu1_1col)
def device_body():
Expand Down Expand Up @@ -58,9 +57,7 @@ def core_body():

# print(ctx.module.operation.verify())

tensorSize = N
tensorSizeInInt32s = tensorSize // 4
tensor_ty = T.memref(tensorSizeInInt32s, T.i32())
tensor_ty = T.memref(N, T.ui8())

@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(inTensor, outTensor, notUsed):
Expand All @@ -70,20 +67,20 @@ def sequence(inTensor, outTensor, notUsed):
ShimTile,
ddr_id=1,
size=trace_size,
offset=tensorSize,
offset=N,
)

npu_dma_memcpy_nd(
metadata="in",
bd_id=0,
mem=inTensor,
sizes=[1, 1, 1, tensorSizeInInt32s],
sizes=[1, 1, 1, N],
)
npu_dma_memcpy_nd(
metadata="out",
bd_id=1,
mem=outTensor,
sizes=[1, 1, 1, tensorSizeInInt32s],
sizes=[1, 1, 1, N],
)
npu_sync(column=0, row=0, direction=0, channel=0)

Expand Down
15 changes: 3 additions & 12 deletions programming_examples/basic/vector_exp/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,7 @@
# AI Engine structural design function
def my_eltwise_exp():

word_size_in = 2
N = 65536
N_in_bytes = N * word_size_in

A_sz_in_i32s = N_in_bytes // 4
C_sz_in_i32s = N_in_bytes // 4

# Tile sizes
n = 1024
Expand Down Expand Up @@ -103,16 +98,12 @@ def core_body():
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())
tensor_ty = T.memref(N, T.bf16())

@FuncOp.from_py_func(tensor_ty, tensor_ty)
def sequence(A, C):
npu_dma_memcpy_nd(
metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
)
npu_dma_memcpy_nd(
metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
)
npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_sync(column=0, row=0, direction=0, channel=0)


Expand Down
12 changes: 4 additions & 8 deletions programming_examples/basic/vector_scalar_mul/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,8 @@


def my_vector_scalar(vector_size, trace_size):
word_size_in = 2
N = vector_size
N_in_i32s = N * word_size_in // 4
N_in_bytes = N_in_i32s * 4
N_in_bytes = N * 2
N_div_n = 4 # chop input vector into 4 sub-vectors
n = N // N_div_n

Expand Down Expand Up @@ -82,7 +80,7 @@ def core_body():
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N_in_i32s, T.i32())
tensor_ty = T.memref(N, T.i16())
scalar_ty = T.memref(1, T.i32())

@FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty)
Expand All @@ -96,10 +94,8 @@ def sequence(A, F, C):
size=trace_size,
offset=N_in_bytes,
)
npu_dma_memcpy_nd(
metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N_in_i32s]
)
npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s])
npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
npu_sync(column=0, row=0, direction=0, channel=0)

Expand Down
Loading

0 comments on commit 0504f7a

Please sign in to comment.