Skip to content

Commit

Permalink
Make aiex.npu.dma_memcpy_nd d0 stride explicit (#1586)
Browse files Browse the repository at this point in the history
  • Loading branch information
fifield authored Jul 2, 2024
1 parent a764c8b commit cd33847
Show file tree
Hide file tree
Showing 41 changed files with 246 additions and 153 deletions.
8 changes: 4 additions & 4 deletions include/aie/Dialect/AIEX/IR/AIEX.td
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
Variadic<I64>:$strides,
ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<4>]>:$static_offsets,
ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<4>]>:$static_sizes,
ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<3>]>:$static_strides,
ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<4>]>:$static_strides,
FlatSymbolRefAttr:$metadata,
I64Attr:$id,
DefaultValuedOptionalAttr<BoolAttr, "false">:$issue_token
Expand All @@ -518,7 +518,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
The returned stride[0] is the lowest dimension stride, i.e. stride 0. It is
given explicitly by the op; a lowest stride of one is not a hardware
requirement. */
llvm::SmallVector<int64_t, 3> getStridesInAddressGranularity();
llvm::SmallVector<int64_t, 4> getStridesInAddressGranularity();

/* Returns the multi-dimensional data transfer sizes in units of address
granularity. These sizes are expressed in units of element data type in
Expand All @@ -539,7 +539,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [

let extraClassDefinition = [{
unsigned $cppClass::getOffsetSizeAndStrideStartOperandIndex() { return 1; }
std::array<unsigned, 3> $cppClass::getArrayAttrMaxRanks() { return {4, 4, 3}; }
std::array<unsigned, 3> $cppClass::getArrayAttrMaxRanks() { return {4, 4, 4}; }
}];

let hasVerifier = 1;
Expand All @@ -556,7 +556,7 @@ def AIE_NpuDmaWaitOp: AIEX_Op<"npu.dma_wait", []> {
...
aie.objectfifo @out0(%tile_0_1, {%tile_0_0}, 4 : i32) : !aie.objectfifo<memref<32x32xi32>>
...
aiex.npu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32>
aiex.npu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32>
...
aiex.npu.dma_wait { symbol = @out0 }
```
Expand Down
77 changes: 53 additions & 24 deletions lib/Dialect/AIEX/IR/AIEXDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,18 +64,18 @@ LogicalResult AIEX::BroadcastPacketOp::verify() {
return success();
}

llvm::SmallVector<int64_t, 3>
llvm::SmallVector<int64_t, 4>
AIEX::NpuDmaMemcpyNdOp::getStridesInAddressGranularity() {
const auto &targetModel = AIE::getTargetModel(*this);
MemRefType buffer = getMemref().getType();
auto elemWidth = buffer.getElementTypeBitWidth();
auto addressGranularity = targetModel.getAddressGenGranularity();
llvm::SmallVector<int64_t, 3> strides =
llvm::SmallVector<int64_t, 4> strides =
llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
if (!strides.empty()) {
for (int i = 0; i < 3; i++) {
for (int i = 0; i < 4; i++) {
strides[i] = (strides[i] * elemWidth) / addressGranularity;
}
}
Expand Down Expand Up @@ -148,7 +148,7 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
}))
return emitOpError("Only constant offsets currently supported.");

llvm::SmallVector<int64_t, 3> raw_strides =
llvm::SmallVector<int64_t, 4> raw_strides =
llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
Expand All @@ -157,41 +157,70 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
return getConstantIntValue(s).value();
});

llvm::SmallVector<int64_t, 3> strides = getStridesInAddressGranularity();
llvm::SmallVector<int64_t, 4> strides = getStridesInAddressGranularity();
llvm::SmallVector<int64_t, 4> sizes = getSizesInAddressGranularity();
int64_t offset = getOffsetInBytes();

if (sizes[3] > 64)
return emitOpError("Size 3 exceeds the [1:64] range.");
if (strides[1] && sizes[1] > 0x3FF)
return emitOpError("Size 1 exceeds the [0:1023] range.");
if (strides[0] && sizes[0] > 0x3FF)
return emitOpError("Size 0 exceeds the [0:1023] range.");
if (strides[2] > 0x100000)
return emitOpError("Stride 3 exceeds the [1:1M] range.");
if (strides[1] > 0x100000)
return emitOpError("Stride 2 exceeds the [1:1M] range.");
if (strides[0] > 0x100000)
return emitOpError("Stride 1 exceeds the [1:1M] range.");
// The experimental HSA target uses this op on AIE1, skip all the AIE2
// specific checks
if (targetModel.getTargetArch() == AIE::AIEArch::AIE1)
return success();

uint32_t wrap_bits = 0;
uint32_t step_bits = 0;
uint32_t iter_bits = 6;
if (targetModel.isShimNOCTile(getX(), getY())) {
step_bits = 20; // XAIEMLGBL_NOC_MODULE_DMA_BD0_3_D0_STEPSIZE_WIDTH
wrap_bits = 10; // XAIEMLGBL_NOC_MODULE_DMA_BD0_3_D0_WRAP_WIDTH
} else if (targetModel.isMemTile(getX(), getY())) {
step_bits = 17; // XAIEMLGBL_MEM_TILE_MODULE_DMA_BD0_2_D0_STEPSIZE_WIDTH
wrap_bits = 10; // XAIEMLGBL_MEM_TILE_MODULE_DMA_BD0_2_D0_WRAP_WIDTH
} else if (targetModel.isCoreTile(getX(), getY())) {
step_bits = 13; // XAIEMLGBL_MEMORY_MODULE_DMA_BD0_2_D0_STEPSIZE_WIDTH
wrap_bits = 8; // XAIEMLGBL_MEMORY_MODULE_DMA_BD0_3_D0_WRAP_WIDTH
} else {
return emitOpError("Unsupported tile type at (" + std::to_string(getX()) +
", " + std::to_string(getY()) +
") Must be ShimNOC, Mem or Core.");
}

if (sizes[3] > (1 << iter_bits))

Check warning on line 187 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 187 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 187 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 187 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 187 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 187 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 187 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 187 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
return emitOpError(
"Size 3 exceeds the [1:" + std::to_string(1 << iter_bits) + "] range.");
if (strides[2] && sizes[1] > (1 << wrap_bits) - 1)
return emitOpError("Size 1 exceeds the [0:" +
std::to_string((1 << wrap_bits) - 1) + "] range.");
if (strides[1] && sizes[0] > (1 << wrap_bits) - 1)
return emitOpError("Size 0 exceeds the [0:" +
std::to_string((1 << wrap_bits) - 1) + "] range.");
if (strides[3] > (1 << step_bits))

Check warning on line 196 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 196 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 196 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 196 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 196 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 196 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 196 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 196 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
return emitOpError("Stride 3 exceeds the [1:" +
std::to_string(1 << step_bits) + "] range.");
if (strides[2] > (1 << step_bits))

Check warning on line 199 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 199 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 199 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 199 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 199 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 199 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 199 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 199 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
return emitOpError("Stride 2 exceeds the [1:" +
std::to_string(1 << step_bits) + "] range.");
if (strides[1] > (1 << step_bits))

Check warning on line 202 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 202 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 202 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 202 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 202 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 202 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 202 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 202 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
return emitOpError("Stride 1 exceeds the [1:" +
std::to_string(1 << step_bits) + "] range.");

if (offset % 4 != 0) {
return emitOpError("Offset must be 4-byte-aligned.");
}

bool error = false;
std::stringstream msg;
for (int i = 0; i < 3; i++) {
for (int i = 0; i < 4; i++) {
// strides[0] == 1 is ok iff the transfer size is a multiple of
// addressGranularity, which is checked below
if (i == 0 && raw_strides[i] == 1)
continue;
if (raw_strides[i] * elemWidth % addressGranularity != 0) {
error = true;
std::stringstream msg;
msg << "Stride " << i << " is " << raw_strides[i] << " elements * "
<< (elemWidth / 8) << " bytes = " << (raw_strides[i] * elemWidth / 8)
<< " bytes, which is not divisible by " << (addressGranularity / 8)
<< ". ";
return emitOpError(msg.str());
}
}
if (error) {
return emitOpError(msg.str());
}

if (raw_sizes[0] * elemWidth % addressGranularity != 0) {
std::stringstream msg;
Expand Down
23 changes: 12 additions & 11 deletions lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
auto issue_token = BoolAttr::get(ctx, false);
auto repeat_count = zero;

llvm::SmallVector<int64_t, 3> strides = op.getStridesInAddressGranularity();
llvm::SmallVector<int64_t, 4> strides = op.getStridesInAddressGranularity();
llvm::SmallVector<int64_t, 4> sizes = op.getSizesInAddressGranularity();
int64_t offset = op.getOffsetInBytes();

Expand Down Expand Up @@ -260,33 +260,34 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
// packet_type

// d0_size
if (strides[0])
if (strides[1])
d0_size = IntegerAttr::get(i32ty, sizes[0]);

// d0_stride
d0_stride = IntegerAttr::get(i32ty, 0);
if (strides[0])
d0_stride = IntegerAttr::get(i32ty, strides[0] - 1);

// d1_size
if (strides[1])
if (strides[2])
d1_size = IntegerAttr::get(i32ty, sizes[1]);

// d1_stride
if (strides[0])
d1_stride = IntegerAttr::get(i32ty, strides[0] - 1);
if (strides[1])
d1_stride = IntegerAttr::get(i32ty, strides[1] - 1);

// d2_stride
if (strides[1])
d2_stride = IntegerAttr::get(i32ty, strides[1] - 1);
if (strides[2])
d2_stride = IntegerAttr::get(i32ty, strides[2] - 1);

// iteration_current

// iteration_size
if (strides[2])
if (strides[3])
iteration_size = IntegerAttr::get(i32ty, sizes[3] - 1);

// iteration_stride
if (strides[2])
iteration_stride = IntegerAttr::get(i32ty, strides[2] - 1);
if (strides[3])
iteration_stride = IntegerAttr::get(i32ty, strides[3] - 1);

// next_bd

Expand Down
14 changes: 9 additions & 5 deletions lib/Targets/AIETargetHSA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
bool isMM2S = channelDir == AIE::DMAChannelDir::MM2S;
int col = infoOp->getCol();

llvm::SmallVector<int64_t, 3> strides = llvm::map_to_vector(
llvm::SmallVector<int64_t, 4> strides = llvm::map_to_vector(
llvm::reverse(op.getMixedStrides()),
[](OpFoldResult s) { return getConstantIntValue(s).value(); });
::SmallVector<int64_t, 4> sizes = llvm::map_to_vector(
Expand Down Expand Up @@ -171,6 +171,10 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
}
}

if (strides[0] != 1)
return module.emitOpError("nd_memcpy inner-dimension stride != 1 is "
"unsupported by HSA target");

// Writing the packet information to perform the DMA
output << "\thsa_agent_dispatch_packet_t pkt" << op_count << " ;\n";
output << "\twr_idx = hsa_queue_add_write_index_relaxed(q, 1);\n";
Expand All @@ -181,10 +185,10 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
<< "/* channel */, 4 /* Burst length */, 2 /* Memory space */, "
"(uint64_t)buf"
<< arg_idx << " + " << offset << " /* Address */, " << sizes[0] * 4
<< " /* 1d_length */, " << (strides[0] ? sizes[1] : 1)
<< " /* 2d_length */, " << (strides[0] ? strides[0] * 4 : 0)
<< " /* 2d_stride */, " << (strides[1] ? sizes[2] : 1)
<< " /* 3d_length */, " << (strides[1] ? strides[1] * 4 : 0)
<< " /* 1d_length */, " << (strides[1] ? sizes[1] : 1)
<< " /* 2d_length */, " << (strides[1] ? strides[1] * 4 : 0)
<< " /* 2d_stride */, " << (strides[2] ? sizes[2] : 1)
<< " /* 3d_length */, " << (strides[2] ? strides[2] * 4 : 0)
<< " /* 3d_stride */ , 1 /* 4d_length */, 0 /* 4d_stride */);\n";

bool last_op = op_count == (num_ops - 1);
Expand Down
6 changes: 5 additions & 1 deletion programming_examples/basic/dma_transpose/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@ def sequence(A, B, C):
# The strides below are configured to read across all rows in the same column
# Stride of K in dim/wrap 2 skips an entire row to read a full column
npu_dma_memcpy_nd(
metadata="in", bd_id=1, mem=A, sizes=[1, K, M, 1], strides=[1, 1, K]
metadata="in",
bd_id=1,
mem=A,
sizes=[1, K, M, 1],
strides=[1, 1, K, 1],
)
npu_sync(column=0, row=0, direction=0, channel=0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def sequence(A, B, C):
bd_id=2,
mem=B,
sizes=[M_div_m_div_n_cores, 1, 1, K],
strides=[0, 0, 0],
strides=[0, 0, 0, 1],
)
for i in range(n_cores):
A_offset = i * M_div_m_div_n_cores * m * K
Expand All @@ -207,15 +207,15 @@ def sequence(A, B, C):
mem=A,
offsets=[0, 0, 0, A_offset],
sizes=[M_div_m_div_n_cores, K_div_k, m, k],
strides=[m_x_K, k, K],
strides=[m_x_K, k, K, 1],
)
npu_dma_memcpy_nd(
metadata=outC_fifo_names[i],
bd_id=0,
mem=C,
offsets=[0, 0, 0, C_offset],
sizes=[1, 1, 1, C_sz_div_n_cores],
strides=[0, 0, 0],
strides=[0, 0, 0, 1],
)

for i in range(n_cores):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def sequence(A, B, C):
mem=C,
offsets=[0, 0, 0, C_row_offset],
sizes=[num_tile_rows, N_div_n, m, n],
strides=[m_x_N, n, N],
strides=[m_x_N, n, N, 1],
)
for tile_row in range(num_tile_rows):
A_row_offset = (
Expand All @@ -238,14 +238,14 @@ def sequence(A, B, C):
mem=A,
offsets=[0, 0, 0, A_row_offset],
sizes=[N_div_n, K_div_k, m, k],
strides=[0, k, K],
strides=[0, k, K, 1],
)
npu_dma_memcpy_nd(
metadata="inB",
bd_id=2 * tile_row + 2,
mem=B,
sizes=[N_div_n, K_div_k, k, n],
strides=[n, k_x_N, N],
strides=[n, k_x_N, N, 1],
)

npu_sync(column=0, row=0, direction=0, channel=0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def sequence(A, B, C):
mem=C,
offsets=[0, 0, 0, C_offset],
sizes=[num_tile_rows, N // n // n_cols, m * n_rows, n],
strides=[m * n_rows * N, n * n_cols, N],
strides=[m * n_rows * N, n * n_cols, N, 1],
)
for tile_row in range(num_tile_rows):
A_row_offset = (
Expand All @@ -342,15 +342,15 @@ def sequence(A, B, C):
mem=A,
offsets=[0, 0, 0, A_offset],
sizes=[N // n // n_cols, K // k, m, k],
strides=[0, k, K],
strides=[0, k, K, 1],
)
npu_dma_memcpy_nd(
metadata=inB_fifo_names[i],
bd_id=2 * tile_row + 2,
mem=B,
offsets=[0, 0, 0, B_col_offset],
sizes=[N // n // n_cols, K // k, k, n],
strides=[n * n_cols, k * N, N],
strides=[n * n_cols, k * N, N, 1],
)
for i in range(n_cols):
npu_sync(column=i, row=0, direction=0, channel=0)
Expand Down
4 changes: 2 additions & 2 deletions programming_examples/basic/matrix_scalar_add/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,14 @@ def sequence(inTensor, notUsed, outTensor):
bd_id=0,
mem=outTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH],
strides=[1, 1, IMAGE_WIDTH, 1],
)
npu_dma_memcpy_nd(
metadata="in0",
bd_id=1,
mem=inTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH],
strides=[1, 1, IMAGE_WIDTH, 1],
)
npu_sync(column=0, row=0, direction=0, channel=0)

Expand Down
10 changes: 5 additions & 5 deletions programming_examples/ml/resnet/layers_conv2_x/aie.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -999,11 +999,11 @@ aie.device(npu1_3col) {
%total_wts_3_off = arith.constant 35840 : i64

//dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
aiex.npu.dma_memcpy_nd(0, 0, %in0[0, 0, 0, 0][1, 1, 1, %act_in][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32>
aiex.npu.dma_memcpy_nd(0, 0, %out[0, 0, 0, 0][1, 1, 1, %act_out][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32>
aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, 0][1, 1, 1, %total_wts][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<53248xi32>
aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts][1, 1, 1, %total_wts_2][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_1_L3L2} : memref<53248xi32>
aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32>
aiex.npu.dma_memcpy_nd(0, 0, %in0[0, 0, 0, 0][1, 1, 1, %act_in][0, 0, 0, 1]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32>
aiex.npu.dma_memcpy_nd(0, 0, %out[0, 0, 0, 0][1, 1, 1, %act_out][0, 0, 0, 1]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32>
aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, 0][1, 1, 1, %total_wts][0, 0, 0, 1]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<53248xi32>
aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts][1, 1, 1, %total_wts_2][0, 0, 0, 1]) {id = 1 : i64, metadata = @inOF_wts_1_L3L2} : memref<53248xi32>
aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0, 1]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32>

aiex.npu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
return
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ module @passThroughLine_aie2 {
%tilewidth = arith.constant 480 : i64 // in 32b words so tileWidth/4

//dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32>
aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32>
aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth, %c1]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32>
aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth, %c1]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32>
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
return
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ module @passThroughLine_aie2 {
%totalLenRGBA = arith.constant 2073600 : i64

//dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32>
aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32>
aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0, %c1]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32>
aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0, %c1]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32>
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
return
}
Expand Down
Loading

0 comments on commit cd33847

Please sign in to comment.