Offset and validation fixes for npu_memcpy_nd (#1580)
andrej authored Jun 27, 2024
1 parent 8b30632 commit 54efffa
Showing 9 changed files with 512 additions and 47 deletions.
26 changes: 26 additions & 0 deletions include/aie/Dialect/AIEX/IR/AIEX.td
@@ -509,6 +509,32 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
let extraClassDeclaration = [{
static unsigned getOffsetSizeAndStrideStartOperandIndex();
static std::array<unsigned, 3> getArrayAttrMaxRanks();

/* Returns the provided multi-dimensional data transfer strides in units of
address granularity. In the IR, strides are expressed in units of the element
data type, but the hardware requires them in units of address granularity.
The address granularity is currently 4 bytes for all hardware.

The returned stride[0] is the second-lowest dimension stride, i.e.
stride 1. The lowest stride is currently implicitly one, but this is not
a hardware requirement and could be changed in the future. */
llvm::SmallVector<int64_t, 3> getStridesInAddressGranularity();

/* Returns the multi-dimensional data transfer sizes in units of address
granularity. In the IR, these sizes are expressed in units of the element
data type, but the hardware requires them in units of address granularity.
The address granularity is currently 4 bytes for all hardware.

The returned size[0] is the lowest dimension size. In the IR, the sizes
are given in reverse order. For example, specifying sizes in IR as
[1, 2, 3, 4] would result in this function returning [4, 3, 2, 1].
*/
llvm::SmallVector<int64_t, 4> getSizesInAddressGranularity();

/* Returns the data transfer offset in bytes, i.e. the first N bytes of the
target buffer will be skipped. In the IR, offsets are expressed in units
of memref element data type size. */
int64_t getOffsetInBytes();
}];

let extraClassDefinition = [{
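The conversions declared above are easiest to see with concrete numbers. Below is a minimal Python sketch of the stride and size conversions (illustrative values only, not the dialect's implementation), assuming 16-bit elements and the current 4-byte address granularity:

elem_width = 16                       # e.g. a memref of i16 elements
addr_granularity = 32                 # bits; 4 bytes on current hardware
ir_sizes = [1, 2, 3, 8]               # IR order: highest dimension first
ir_strides = [0, 24, 8]               # in units of elements

hw_sizes = list(reversed(ir_sizes))   # [8, 3, 2, 1]: lowest dimension first
hw_sizes[0] = hw_sizes[0] * elem_width // addr_granularity  # 8 i16 = 4 words

hw_strides = [s * elem_width // addr_granularity
              for s in reversed(ir_strides)]                # [8, 24, 0] -> [4, 12, 0]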
103 changes: 98 additions & 5 deletions lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -64,10 +64,69 @@ LogicalResult AIEX::BroadcastPacketOp::verify() {
return success();
}

llvm::SmallVector<int64_t, 3>
AIEX::NpuDmaMemcpyNdOp::getStridesInAddressGranularity() {
const auto &targetModel = AIE::getTargetModel(*this);
MemRefType buffer = getMemref().getType();
auto elemWidth = buffer.getElementTypeBitWidth();
auto addressGranularity = targetModel.getAddressGenGranularity();
llvm::SmallVector<int64_t, 3> strides =
llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
if (!strides.empty()) {
for (int i = 0; i < 3; i++) {
strides[i] = (strides[i] * elemWidth) / addressGranularity;
}
}
return strides;
}

llvm::SmallVector<int64_t, 4>
AIEX::NpuDmaMemcpyNdOp::getSizesInAddressGranularity() {
const auto &targetModel = AIE::getTargetModel(*this);
MemRefType buffer = getMemref().getType();
auto elemWidth = buffer.getElementTypeBitWidth();
auto addressGranularity = targetModel.getAddressGenGranularity();
llvm::SmallVector<int64_t, 4> sizes =
llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
if (!sizes.empty()) {
sizes[0] = (sizes[0] * elemWidth) / addressGranularity;
}
return sizes;
}

/* Calculates the data transfer offset in bytes, i.e. the value that is
   written to the buffer descriptor's buffer_offset field. */
int64_t AIEX::NpuDmaMemcpyNdOp::getOffsetInBytes() {
llvm::SmallVector<int64_t, 4> offsets =
llvm::map_to_vector(llvm::reverse(getMixedOffsets()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
size_t stride = 1;
size_t offset = 0;
MemRefType my_memref = getMemref().getType();
auto shape = my_memref.getShape();
size_t R = shape.size();
size_t el_bit_width = my_memref.getElementTypeBitWidth();
assert(el_bit_width % 8 == 0 &&
"Expected Memref element bitwidth to be multiple of 8.");
size_t S = el_bit_width / 8;
for (size_t i = 0; i < R; i++) {
offset += offsets[i] * stride * S;
stride *= shape[R - i - 1];
}
return offset;
}
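A worked example of this computation, with illustrative values: for a memref<2x3x4xi32> and IR offsets [1, 2, 3] (highest dimension first), the reversed offsets are [3, 2, 1] and S = 4 bytes. In Python:

shape, S = [2, 3, 4], 4               # memref<2x3x4xi32>
offsets = [3, 2, 1]                   # reversed: lowest dimension first
offset, stride = 0, 1
for i in range(len(shape)):
    offset += offsets[i] * stride * S # +12, then +32, then +48
    stride *= shape[len(shape) - i - 1]
assert offset == 92                   # element index 23 (= 1*12 + 2*4 + 3) * 4 bytes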

LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
MemRefType buffer = getMemref().getType();
const auto &targetModel = AIE::getTargetModel(*this);
auto addressGranularity = targetModel.getAddressGenGranularity();
auto elemWidth = buffer.getElementTypeBitWidth();

if (buffer.getElementTypeBitWidth() > addressGranularity) {
return emitOpError("Maximum element bit width allowed is ")
<< addressGranularity << " bits. ";
@@ -79,25 +138,29 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) {
return getConstantIntValue(s).has_value();
}))
llvm::report_fatal_error("Only constant strides currently supported.");
return emitOpError("Only constant strides currently supported.");
if (!llvm::all_of(getMixedSizes(), [](OpFoldResult s) {
return getConstantIntValue(s).has_value();
}))
llvm::report_fatal_error("Only constant sizes currently supported.");
return emitOpError("Only constant sizes currently supported.");
if (!llvm::all_of(getMixedOffsets(), [](OpFoldResult s) {
return getConstantIntValue(s).has_value();
}))
llvm::report_fatal_error("Only constant offsets currently supported.");
return emitOpError("Only constant offsets currently supported.");

- llvm::SmallVector<int64_t, 3> strides =
+ llvm::SmallVector<int64_t, 3> raw_strides =
llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
- llvm::SmallVector<int64_t, 4> sizes =
+ llvm::SmallVector<int64_t, 4> raw_sizes =
llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});

llvm::SmallVector<int64_t, 3> strides = getStridesInAddressGranularity();
llvm::SmallVector<int64_t, 4> sizes = getSizesInAddressGranularity();
int64_t offset = getOffsetInBytes();

if (sizes[3] > 64)
return emitOpError("Size 3 exceeds the [1:64] range.");
if (strides[1] && sizes[1] > 0x3FF)
@@ -110,6 +173,36 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
return emitOpError("Stride 2 exceeds the [1:1M] range.");
if (strides[0] > 0x100000)
return emitOpError("Stride 1 exceeds the [1:1M] range.");

if (offset % 4 != 0) {
return emitOpError("Offset must be 4-byte-aligned.");
}

bool error = false;
std::stringstream msg;
for (int i = 0; i < 3; i++) {
if (raw_strides[i] * elemWidth % addressGranularity != 0) {
error = true;
msg << "Stride " << i << " is " << raw_strides[i] << " elements * "
<< (elemWidth / 8) << " bytes = " << (raw_strides[i] * elemWidth / 8)
<< " bytes, which is not divisible by " << (addressGranularity / 8)
<< ". ";
}
}
if (error) {
return emitOpError(msg.str());
}

if (raw_sizes[0] * elemWidth % addressGranularity != 0) {
std::stringstream msg;
msg << "Transfer sizes must be multiples of " << (addressGranularity / 8)
<< " bytes. " << raw_sizes[0] << " elements at " << (elemWidth / 8)
<< " bytes each equal " << (raw_sizes[0] * elemWidth / 8)
<< " bytes, which is not divisible by " << (addressGranularity / 8)
<< ". ";
return emitOpError(msg.str());
}

return success();
}

41 changes: 3 additions & 38 deletions lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
@@ -219,31 +219,9 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
auto issue_token = BoolAttr::get(ctx, false);
auto repeat_count = zero;

- llvm::SmallVector<int64_t, 3> strides = llvm::map_to_vector(
-     llvm::reverse(op.getMixedStrides()),
-     [](OpFoldResult s) { return getConstantIntValue(s).value(); });
- llvm::SmallVector<int64_t, 4> sizes = llvm::map_to_vector(
-     llvm::reverse(op.getMixedSizes()),
-     [](OpFoldResult s) { return getConstantIntValue(s).value(); });
- llvm::SmallVector<int64_t, 4> offsets = llvm::map_to_vector(
-     llvm::reverse(op.getMixedOffsets()),
-     [](OpFoldResult s) { return getConstantIntValue(s).value(); });
-
- MemRefType buffer = op.getMemref().getType();
- const auto &targetModel = AIE::getTargetModel(op);
- auto elemWidth = buffer.getElementTypeBitWidth();
- auto addressGranularity = targetModel.getAddressGenGranularity();
- if (elemWidth < addressGranularity) {
-   if (!strides.empty()) {
-     for (int i = 0; i < 3; i++) {
-       strides[i] = (strides[i] * elemWidth) / addressGranularity;
-     }
-   }
-   if (!sizes.empty())
-     sizes[0] = (sizes[0] * elemWidth) / addressGranularity;
-   if (!offsets.empty())
-     offsets[0] = (offsets[0] * elemWidth) / addressGranularity;
- }
+ llvm::SmallVector<int64_t, 3> strides = op.getStridesInAddressGranularity();
+ llvm::SmallVector<int64_t, 4> sizes = op.getSizesInAddressGranularity();
+ int64_t offset = op.getOffsetInBytes();

// column
column = IntegerAttr::get(i32ty, col);
Expand Down Expand Up @@ -271,19 +249,6 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
buffer_length = IntegerAttr::get(i32ty, repeat_length);

// buffer_offset
- size_t stride = 1;
- size_t offset = 0;
- MemRefType my_memref = op.getMemref().getType();
- auto shape = my_memref.getShape();
- size_t R = shape.size();
- size_t el_bit_width = my_memref.getElementTypeBitWidth();
- assert(el_bit_width % 8 == 0 &&
-        "Expected Memref element bitwidth to be multiple of 8.");
- size_t S = el_bit_width / 8;
- for (size_t i = 0; i < R; i++) {
-   offset += offsets[i] * stride * S;
-   stride *= shape[R - i - 1];
- }
buffer_offset = IntegerAttr::get(i32ty, offset);

// enable_packet
@@ -231,7 +231,7 @@ def sequence(A, B, C):
for tile_row in range(num_tile_rows):
A_row_offset = (
((tile_row_block * rows_per_block) + tile_row) * m * K
- ) * 2
+ )
npu_dma_memcpy_nd(
metadata="inA",
bd_id=2 * tile_row + 1,
@@ -317,7 +317,7 @@ def sequence(A, B, C):
C_row_offset = tile_row_block * rows_per_block * m * n_rows * N
for i in range(n_cols):
C_col_offset = i * n
- C_offset = (C_col_offset + C_row_offset) * 2
+ C_offset = C_col_offset + C_row_offset
npu_dma_memcpy_nd(
metadata=outC_fifo_names[i],
bd_id=0,
@@ -334,8 +334,8 @@ def sequence(A, B, C):
* K
)
A_col_offset = i * m * K
- A_offset = (A_row_offset + A_col_offset) * 2
- B_col_offset = i * n * 2
+ A_offset = A_row_offset + A_col_offset
+ B_col_offset = i * n
npu_dma_memcpy_nd(
metadata=inA_fifo_names[i],
bd_id=2 * tile_row + 1,
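The dropped * 2 factors in these host sequences follow from the new getOffsetInBytes above: offsets passed to npu_dma_memcpy_nd are now interpreted in units of the memref element type, and the op scales them to bytes internally. A sketch of the convention change in Python (the removed factor of 2 suggests 2-byte elements such as bf16 or i16; all values are illustrative):

tile_row, m, K = 3, 64, 256                       # illustrative tile coordinates
element_size = 2                                  # assumed bytes per element
row_offset_elems = tile_row * m * K               # offset expressed in elements
old_offset_arg = row_offset_elems * element_size  # bytes: convention before this commit
new_offset_arg = row_offset_elems                 # elements: convention after this commit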
100 changes: 100 additions & 0 deletions test/dialect/AIEX/bad_npu_nd.mlir
@@ -66,3 +66,103 @@ module {

// -----

// Offsets need to be 4-byte aligned.

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd_stride(%a : memref<8xi8>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64
%c8 = arith.constant 8 : i64
// expected-error@+1 {{Offset must be 4-byte-aligned}}
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c1][%c1,%c1,%c1,%c8][%c0,%c0,%c1]) { metadata = @fifo, id = 0 : i64 } : memref<8xi8>
return
}
aie.shim_dma_allocation @fifo (MM2S, 0, 0)
}
}

// -----

// For element types other than i32, strides and sizes are converted to 4-byte address
// granularity before the hardware limits are checked; it is the converted values, not
// the raw element counts, that must stay in range. The following tests check this.

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd(%a : memref<8xi8>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64
%c4 = arith.constant 4 : i64
%c8 = arith.constant 8 : i64
%c2048 = arith.constant 2048 : i64
// Although 2048 exceeds the [0:1023] limit for size 0, the elements are i8s, so this
// becomes a size of 512 at the 4-byte address granularity and hence passes verification.
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
return
}
aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
}
}

// -----

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd(%a : memref<8xi16>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64
%c4 = arith.constant 4 : i64
%c8 = arith.constant 8 : i64
%c2048 = arith.constant 2048 : i64
// expected-error@+1 {{Size 0 exceeds the [0:1023] range}}
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
return
}
aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
}
}

// -----

// The hardware expresses strides and sizes at 4-byte granularity, but the IR expresses
// them at memref element type granularity. The following tests make sure proper errors
// are generated when an element-granularity value does not convert to a whole number of
// 4-byte words.

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd(%a : memref<8xi8>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64 // Stride of 2 i8s = 2 bytes < 4 byte granularity, should not be possible
%c8 = arith.constant 8 : i64
%c1920 = arith.constant 1920 : i64
%c1080 = arith.constant 1080 : i64
// expected-error@+1 {{Stride 0 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}}
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c2]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
return
}
aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
}
}

// -----

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd(%a : memref<8xi8>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64
%c4 = arith.constant 4 : i64
%c8 = arith.constant 8 : i64
%c1920 = arith.constant 1920 : i64
%c1080 = arith.constant 1080 : i64
// expected-error@+1 {{2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4}}
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
return
}
aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
}
}