Offset and validation fixes for npu_memcpy_nd (#1580)
andrej authored Jun 27, 2024
1 parent 8b30632 commit 54efffa
Showing 9 changed files with 512 additions and 47 deletions.
26 changes: 26 additions & 0 deletions include/aie/Dialect/AIEX/IR/AIEX.td
@@ -509,6 +509,32 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
let extraClassDeclaration = [{
static unsigned getOffsetSizeAndStrideStartOperandIndex();
static std::array<unsigned, 3> getArrayAttrMaxRanks();

/* Returns the provided multi-dimensional data transfer strides in units of
address granularity. In the IR, strides are expressed in units of the element
data type, but the hardware requires them in units of address granularity.
The address granularity is currently 4 bytes for all hardware.

The returned stride[0] is the second-lowest dimension stride, i.e.
stride 1. The lowest stride is currently implicitly one, but this is not
a hardware requirement and could be changed in the future. */
llvm::SmallVector<int64_t, 3> getStridesInAddressGranularity();

/* Returns the multi-dimensional data transfer sizes in units of address
granularity. In the IR, these sizes are expressed in units of the element
data type, but the hardware requires them in units of address granularity.
The address granularity is currently 4 bytes for all hardware.

The returned size[0] is the lowest dimension size. In the IR, the sizes
are given in reverse order. For example, specifying sizes in IR as
[1, 2, 3, 4] would result in this function returning [4, 3, 2, 1].
*/
llvm::SmallVector<int64_t, 4> getSizesInAddressGranularity();

/* Returns the data transfer offset in bytes, i.e. the first N bytes of the
target buffer will be skipped. In the IR, offsets are expressed in units
of memref element data type size. */
int64_t getOffsetInBytes();
}];

let extraClassDefinition = [{
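The conversions declared above are easiest to see with concrete numbers. Below is a minimal Python sketch of the stride and size conversions (illustrative values only, not the dialect's implementation), assuming 16-bit elements and the current 4-byte address granularity:

elem_width = 16                       # e.g. a memref of i16 elements
addr_granularity = 32                 # bits; 4 bytes on current hardware
ir_sizes = [1, 2, 3, 8]               # IR order: highest dimension first
ir_strides = [0, 24, 8]               # in units of elements

hw_sizes = list(reversed(ir_sizes))   # [8, 3, 2, 1]: lowest dimension first
hw_sizes[0] = hw_sizes[0] * elem_width // addr_granularity  # 8 i16 = 4 words

hw_strides = [s * elem_width // addr_granularity
              for s in reversed(ir_strides)]                # [8, 24, 0] -> [4, 12, 0]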
103 changes: 98 additions & 5 deletions lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -64,10 +64,69 @@ LogicalResult AIEX::BroadcastPacketOp::verify() {
return success();
}

llvm::SmallVector<int64_t, 3>
AIEX::NpuDmaMemcpyNdOp::getStridesInAddressGranularity() {
const auto &targetModel = AIE::getTargetModel(*this);
MemRefType buffer = getMemref().getType();
auto elemWidth = buffer.getElementTypeBitWidth();
auto addressGranularity = targetModel.getAddressGenGranularity();
llvm::SmallVector<int64_t, 3> strides =
llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
if (!strides.empty()) {
for (int i = 0; i < 3; i++) {
strides[i] = (strides[i] * elemWidth) / addressGranularity;
}
}
return strides;
}

llvm::SmallVector<int64_t, 4>
AIEX::NpuDmaMemcpyNdOp::getSizesInAddressGranularity() {
const auto &targetModel = AIE::getTargetModel(*this);
MemRefType buffer = getMemref().getType();
auto elemWidth = buffer.getElementTypeBitWidth();
auto addressGranularity = targetModel.getAddressGenGranularity();
llvm::SmallVector<int64_t, 4> sizes =
llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
if (!sizes.empty()) {
sizes[0] = (sizes[0] * elemWidth) / addressGranularity;
}
return sizes;
}

/* Calculates the data transfer offset in bytes, i.e. the value that is
   written to the buffer descriptor's buffer_offset field. */
int64_t AIEX::NpuDmaMemcpyNdOp::getOffsetInBytes() {
llvm::SmallVector<int64_t, 4> offsets =
llvm::map_to_vector(llvm::reverse(getMixedOffsets()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
size_t stride = 1;
size_t offset = 0;
MemRefType my_memref = getMemref().getType();
auto shape = my_memref.getShape();
size_t R = shape.size();
size_t el_bit_width = my_memref.getElementTypeBitWidth();
assert(el_bit_width % 8 == 0 &&
"Expected Memref element bitwidth to be multiple of 8.");
size_t S = el_bit_width / 8;
for (size_t i = 0; i < R; i++) {
offset += offsets[i] * stride * S;
stride *= shape[R - i - 1];
}
return offset;
}
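A worked example of this computation, with illustrative values: for a memref<2x3x4xi32> and IR offsets [1, 2, 3] (highest dimension first), the reversed offsets are [3, 2, 1] and S = 4 bytes. In Python:

shape, S = [2, 3, 4], 4               # memref<2x3x4xi32>
offsets = [3, 2, 1]                   # reversed: lowest dimension first
offset, stride = 0, 1
for i in range(len(shape)):
    offset += offsets[i] * stride * S # +12, then +32, then +48
    stride *= shape[len(shape) - i - 1]
assert offset == 92                   # element index 23 (= 1*12 + 2*4 + 3) * 4 bytes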

LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
MemRefType buffer = getMemref().getType();
const auto &targetModel = AIE::getTargetModel(*this);
auto addressGranularity = targetModel.getAddressGenGranularity();
auto elemWidth = buffer.getElementTypeBitWidth();

if (buffer.getElementTypeBitWidth() > addressGranularity) {
return emitOpError("Maximum element bit width allowed is ")
<< addressGranularity << " bits. ";
@@ -79,25 +138,29 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) {
return getConstantIntValue(s).has_value();
}))
llvm::report_fatal_error("Only constant strides currently supported.");
return emitOpError("Only constant strides currently supported.");
if (!llvm::all_of(getMixedSizes(), [](OpFoldResult s) {
return getConstantIntValue(s).has_value();
}))
llvm::report_fatal_error("Only constant sizes currently supported.");
return emitOpError("Only constant sizes currently supported.");
if (!llvm::all_of(getMixedOffsets(), [](OpFoldResult s) {
return getConstantIntValue(s).has_value();
}))
llvm::report_fatal_error("Only constant offsets currently supported.");
return emitOpError("Only constant offsets currently supported.");

- llvm::SmallVector<int64_t, 3> strides =
+ llvm::SmallVector<int64_t, 3> raw_strides =
llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
- llvm::SmallVector<int64_t, 4> sizes =
+ llvm::SmallVector<int64_t, 4> raw_sizes =
llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});

llvm::SmallVector<int64_t, 3> strides = getStridesInAddressGranularity();
llvm::SmallVector<int64_t, 4> sizes = getSizesInAddressGranularity();
int64_t offset = getOffsetInBytes();

if (sizes[3] > 64)
return emitOpError("Size 3 exceeds the [1:64] range.");
if (strides[1] && sizes[1] > 0x3FF)
@@ -110,6 +173,36 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
return emitOpError("Stride 2 exceeds the [1:1M] range.");
if (strides[0] > 0x100000)
return emitOpError("Stride 1 exceeds the [1:1M] range.");

if (offset % 4 != 0) {
return emitOpError("Offset must be 4-byte-aligned.");
}

bool error = false;
std::stringstream msg;
for (int i = 0; i < 3; i++) {
if (raw_strides[i] * elemWidth % addressGranularity != 0) {
error = true;
msg << "Stride " << i << " is " << raw_strides[i] << " elements * "
<< (elemWidth / 8) << " bytes = " << (raw_strides[i] * elemWidth / 8)
<< " bytes, which is not divisible by " << (addressGranularity / 8)
<< ". ";
}
}
if (error) {
return emitOpError(msg.str());
}

if (raw_sizes[0] * elemWidth % addressGranularity != 0) {
std::stringstream msg;
msg << "Transfer sizes must be multiples of " << (addressGranularity / 8)
<< " bytes. " << raw_sizes[0] << " elements at " << (elemWidth / 8)
<< " bytes each equal " << (raw_sizes[0] * elemWidth / 8)
<< " bytes, which is not divisible by " << (addressGranularity / 8)
<< ". ";
return emitOpError(msg.str());
}

return success();
}

41 changes: 3 additions & 38 deletions lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
@@ -219,31 +219,9 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
auto issue_token = BoolAttr::get(ctx, false);
auto repeat_count = zero;

- llvm::SmallVector<int64_t, 3> strides = llvm::map_to_vector(
-     llvm::reverse(op.getMixedStrides()),
-     [](OpFoldResult s) { return getConstantIntValue(s).value(); });
- llvm::SmallVector<int64_t, 4> sizes = llvm::map_to_vector(
-     llvm::reverse(op.getMixedSizes()),
-     [](OpFoldResult s) { return getConstantIntValue(s).value(); });
- llvm::SmallVector<int64_t, 4> offsets = llvm::map_to_vector(
-     llvm::reverse(op.getMixedOffsets()),
-     [](OpFoldResult s) { return getConstantIntValue(s).value(); });
-
- MemRefType buffer = op.getMemref().getType();
- const auto &targetModel = AIE::getTargetModel(op);
- auto elemWidth = buffer.getElementTypeBitWidth();
- auto addressGranularity = targetModel.getAddressGenGranularity();
- if (elemWidth < addressGranularity) {
-   if (!strides.empty()) {
-     for (int i = 0; i < 3; i++) {
-       strides[i] = (strides[i] * elemWidth) / addressGranularity;
-     }
-   }
-   if (!sizes.empty())
-     sizes[0] = (sizes[0] * elemWidth) / addressGranularity;
-   if (!offsets.empty())
-     offsets[0] = (offsets[0] * elemWidth) / addressGranularity;
- }
+ llvm::SmallVector<int64_t, 3> strides = op.getStridesInAddressGranularity();
+ llvm::SmallVector<int64_t, 4> sizes = op.getSizesInAddressGranularity();
+ int64_t offset = op.getOffsetInBytes();

// column
column = IntegerAttr::get(i32ty, col);
Expand Down Expand Up @@ -271,19 +249,6 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
buffer_length = IntegerAttr::get(i32ty, repeat_length);

// buffer_offset
- size_t stride = 1;
- size_t offset = 0;
- MemRefType my_memref = op.getMemref().getType();
- auto shape = my_memref.getShape();
- size_t R = shape.size();
- size_t el_bit_width = my_memref.getElementTypeBitWidth();
- assert(el_bit_width % 8 == 0 &&
-        "Expected Memref element bitwidth to be multiple of 8.");
- size_t S = el_bit_width / 8;
- for (size_t i = 0; i < R; i++) {
-   offset += offsets[i] * stride * S;
-   stride *= shape[R - i - 1];
- }
buffer_offset = IntegerAttr::get(i32ty, offset);

// enable_packet
@@ -231,7 +231,7 @@ def sequence(A, B, C):
for tile_row in range(num_tile_rows):
A_row_offset = (
((tile_row_block * rows_per_block) + tile_row) * m * K
- ) * 2
+ )
npu_dma_memcpy_nd(
metadata="inA",
bd_id=2 * tile_row + 1,
@@ -317,7 +317,7 @@ def sequence(A, B, C):
C_row_offset = tile_row_block * rows_per_block * m * n_rows * N
for i in range(n_cols):
C_col_offset = i * n
- C_offset = (C_col_offset + C_row_offset) * 2
+ C_offset = C_col_offset + C_row_offset
npu_dma_memcpy_nd(
metadata=outC_fifo_names[i],
bd_id=0,
@@ -334,8 +334,8 @@ def sequence(A, B, C):
* K
)
A_col_offset = i * m * K
- A_offset = (A_row_offset + A_col_offset) * 2
- B_col_offset = i * n * 2
+ A_offset = A_row_offset + A_col_offset
+ B_col_offset = i * n
npu_dma_memcpy_nd(
metadata=inA_fifo_names[i],
bd_id=2 * tile_row + 1,
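The dropped * 2 factors in these host sequences follow from the new getOffsetInBytes above: offsets passed to npu_dma_memcpy_nd are now interpreted in units of the memref element type, and the op scales them to bytes internally. A sketch of the convention change in Python (the removed factor of 2 suggests 2-byte elements such as bf16 or i16; all values are illustrative):

tile_row, m, K = 3, 64, 256                       # illustrative tile coordinates
element_size = 2                                  # assumed bytes per element
row_offset_elems = tile_row * m * K               # offset expressed in elements
old_offset_arg = row_offset_elems * element_size  # bytes: convention before this commit
new_offset_arg = row_offset_elems                 # elements: convention after this commit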
100 changes: 100 additions & 0 deletions test/dialect/AIEX/bad_npu_nd.mlir
@@ -66,3 +66,103 @@ module {

// -----

// Offsets need to be 4-byte aligned.

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd_stride(%a : memref<8xi8>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64
%c8 = arith.constant 8 : i64
// expected-error@+1 {{Offset must be 4-byte-aligned}}
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c1][%c1,%c1,%c1,%c8][%c0,%c0,%c1]) { metadata = @fifo, id = 0 : i64 } : memref<8xi8>
return
}
aie.shim_dma_allocation @fifo (MM2S, 0, 0)
}
}

// -----

// For element types other than i32, strides and sizes are converted to 4-byte address
// granularity before the hardware limits are checked; it is the converted values, not
// the raw element counts, that must stay in range. The following tests check this.

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd(%a : memref<8xi8>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64
%c4 = arith.constant 4 : i64
%c8 = arith.constant 8 : i64
%c2048 = arith.constant 2048 : i64
// Although 2048 exceeds the [0:1023] limit for size 0, the elements are i8s, so this
// becomes a size of 512 at the 4-byte address granularity and hence passes verification.
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
return
}
aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
}
}

// -----

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd(%a : memref<8xi16>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64
%c4 = arith.constant 4 : i64
%c8 = arith.constant 8 : i64
%c2048 = arith.constant 2048 : i64
// expected-error@+1 {{Size 0 exceeds the [0:1023] range}}
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
return
}
aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
}
}

// -----

// The hardware expresses strides and sizes at 4-byte granularity, but the IR expresses
// them at memref element type granularity. The following tests make sure proper errors
// are generated when an element-granularity value does not convert to a whole number of
// 4-byte words.

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd(%a : memref<8xi8>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64 // Stride of 2 i8s = 2 bytes < 4 byte granularity, should not be possible
%c8 = arith.constant 8 : i64
%c1920 = arith.constant 1920 : i64
%c1080 = arith.constant 1080 : i64
// expected-error@+1 {{Stride 0 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}}
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c2]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
return
}
aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
}
}

// -----

module {
aie.device(npu1_4col) {
func.func @bad_npu_nd(%a : memref<8xi8>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c2 = arith.constant 2 : i64
%c4 = arith.constant 4 : i64
%c8 = arith.constant 8 : i64
%c1920 = arith.constant 1920 : i64
%c1080 = arith.constant 1080 : i64
// expected-error@+1 {{2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4}}
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
return
}
aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
}
}