Make aiex.npu.dma_memcpy_nd d0 stride explicit (#1586)

Xilinx · Jul 2, 2024 · cd33847 · cd33847
1 parent a764c8b
commit cd33847
Show file tree

Hide file tree

Showing 41 changed files with 246 additions and 153 deletions.
diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td
@@ -492,7 +492,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
         Variadic<I64>:$strides,
         ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<4>]>:$static_offsets,
         ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<4>]>:$static_sizes,
-        ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<3>]>:$static_strides,
+        ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<4>]>:$static_strides,
         FlatSymbolRefAttr:$metadata,
         I64Attr:$id,
         DefaultValuedOptionalAttr<BoolAttr, "false">:$issue_token
@@ -518,7 +518,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
        The returned stride[0] is the second-lowest dimension stride, i.e.
        stride 1. The lowest stride is currently implicitly one, but this is not
        a hardware requirement and could be changed in the future.  */
-    llvm::SmallVector<int64_t, 3> getStridesInAddressGranularity();
+    llvm::SmallVector<int64_t, 4> getStridesInAddressGranularity();
 
     /* Returns the multi-dimensional data transfer sizes in units of address
        granularity. These sizes are expressed in units of element data type in
@@ -539,7 +539,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
 
   let extraClassDefinition = [{
     unsigned $cppClass::getOffsetSizeAndStrideStartOperandIndex() { return 1; }
-    std::array<unsigned, 3> $cppClass::getArrayAttrMaxRanks() { return {4, 4, 3}; }
+    std::array<unsigned, 3> $cppClass::getArrayAttrMaxRanks() { return {4, 4, 4}; }
   }];
 
   let hasVerifier = 1;
@@ -556,7 +556,7 @@ def AIE_NpuDmaWaitOp: AIEX_Op<"npu.dma_wait", []> {
       ...
       aie.objectfifo @out0(%tile_0_1, {%tile_0_0}, 4 : i32) : !aie.objectfifo<memref<32x32xi32>>
       ...
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32>
       ...
       aiex.npu.dma_wait { symbol = @out0 }
     ```

diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -64,18 +64,18 @@ LogicalResult AIEX::BroadcastPacketOp::verify() {
   return success();
 }
 
-llvm::SmallVector<int64_t, 3>
+llvm::SmallVector<int64_t, 4>
 AIEX::NpuDmaMemcpyNdOp::getStridesInAddressGranularity() {
   const auto &targetModel = AIE::getTargetModel(*this);
   MemRefType buffer = getMemref().getType();
   auto elemWidth = buffer.getElementTypeBitWidth();
   auto addressGranularity = targetModel.getAddressGenGranularity();
-  llvm::SmallVector<int64_t, 3> strides =
+  llvm::SmallVector<int64_t, 4> strides =
       llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
         return getConstantIntValue(s).value();
       });
   if (!strides.empty()) {
-    for (int i = 0; i < 3; i++) {
+    for (int i = 0; i < 4; i++) {
       strides[i] = (strides[i] * elemWidth) / addressGranularity;
     }
   }
@@ -148,7 +148,7 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
       }))
     return emitOpError("Only constant offsets currently supported.");
 
-  llvm::SmallVector<int64_t, 3> raw_strides =
+  llvm::SmallVector<int64_t, 4> raw_strides =
       llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
         return getConstantIntValue(s).value();
       });
@@ -157,41 +157,70 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
         return getConstantIntValue(s).value();
       });
 
-  llvm::SmallVector<int64_t, 3> strides = getStridesInAddressGranularity();
+  llvm::SmallVector<int64_t, 4> strides = getStridesInAddressGranularity();
   llvm::SmallVector<int64_t, 4> sizes = getSizesInAddressGranularity();
   int64_t offset = getOffsetInBytes();
 
-  if (sizes[3] > 64)
-    return emitOpError("Size 3 exceeds the [1:64] range.");
-  if (strides[1] && sizes[1] > 0x3FF)
-    return emitOpError("Size 1 exceeds the [0:1023] range.");
-  if (strides[0] && sizes[0] > 0x3FF)
-    return emitOpError("Size 0 exceeds the [0:1023] range.");
-  if (strides[2] > 0x100000)
-    return emitOpError("Stride 3 exceeds the [1:1M] range.");
-  if (strides[1] > 0x100000)
-    return emitOpError("Stride 2 exceeds the [1:1M] range.");
-  if (strides[0] > 0x100000)
-    return emitOpError("Stride 1 exceeds the [1:1M] range.");
+  // The experimental HSA target uses this op on AIE1, skip all the AIE2
+  // specific checks
+  if (targetModel.getTargetArch() == AIE::AIEArch::AIE1)
+    return success();
+
+  uint32_t wrap_bits = 0;
+  uint32_t step_bits = 0;
+  uint32_t iter_bits = 6;
+  if (targetModel.isShimNOCTile(getX(), getY())) {
+    step_bits = 20; // XAIEMLGBL_NOC_MODULE_DMA_BD0_3_D0_STEPSIZE_WIDTH
+    wrap_bits = 10; // XAIEMLGBL_NOC_MODULE_DMA_BD0_3_D0_WRAP_WIDTH
+  } else if (targetModel.isMemTile(getX(), getY())) {
+    step_bits = 17; // XAIEMLGBL_MEM_TILE_MODULE_DMA_BD0_2_D0_STEPSIZE_WIDTH
+    wrap_bits = 10; // XAIEMLGBL_MEM_TILE_MODULE_DMA_BD0_2_D0_WRAP_WIDTH
+  } else if (targetModel.isCoreTile(getX(), getY())) {
+    step_bits = 13; // XAIEMLGBL_MEMORY_MODULE_DMA_BD0_2_D0_STEPSIZE_WIDTH
+    wrap_bits = 8;  // XAIEMLGBL_MEMORY_MODULE_DMA_BD0_3_D0_WRAP_WIDTH
+  } else {
+    return emitOpError("Unsupported tile type at (" + std::to_string(getX()) +
+                       ", " + std::to_string(getY()) +
+                       ") Must be ShimNOC, Mem or Core.");
+  }
+
+  if (sizes[3] > (1 << iter_bits))
+    return emitOpError(
+        "Size 3 exceeds the [1:" + std::to_string(1 << iter_bits) + "] range.");
+  if (strides[2] && sizes[1] > (1 << wrap_bits) - 1)
+    return emitOpError("Size 1 exceeds the [0:" +
+                       std::to_string((1 << wrap_bits) - 1) + "] range.");
+  if (strides[1] && sizes[0] > (1 << wrap_bits) - 1)
+    return emitOpError("Size 0 exceeds the [0:" +
+                       std::to_string((1 << wrap_bits) - 1) + "] range.");
+  if (strides[3] > (1 << step_bits))
+    return emitOpError("Stride 3 exceeds the [1:" +
+                       std::to_string(1 << step_bits) + "] range.");
+  if (strides[2] > (1 << step_bits))
+    return emitOpError("Stride 2 exceeds the [1:" +
+                       std::to_string(1 << step_bits) + "] range.");
+  if (strides[1] > (1 << step_bits))
+    return emitOpError("Stride 1 exceeds the [1:" +
+                       std::to_string(1 << step_bits) + "] range.");
 
   if (offset % 4 != 0) {
     return emitOpError("Offset must be 4-byte-aligned.");
   }
 
-  bool error = false;
-  std::stringstream msg;
-  for (int i = 0; i < 3; i++) {
+  for (int i = 0; i < 4; i++) {
+    // strides[0] == 1 is ok iff the transfer size is a multiple of
+    // addressGranularity, which is checked below
+    if (i == 0 && raw_strides[i] == 1)
+      continue;
     if (raw_strides[i] * elemWidth % addressGranularity != 0) {
-      error = true;
+      std::stringstream msg;
       msg << "Stride " << i << " is " << raw_strides[i] << " elements * "
           << (elemWidth / 8) << " bytes = " << (raw_strides[i] * elemWidth / 8)
           << " bytes, which is not divisible by " << (addressGranularity / 8)
           << ". ";
+      return emitOpError(msg.str());
     }
   }
-  if (error) {
-    return emitOpError(msg.str());
-  }
 
   if (raw_sizes[0] * elemWidth % addressGranularity != 0) {
     std::stringstream msg;

diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
@@ -219,7 +219,7 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
     auto issue_token = BoolAttr::get(ctx, false);
     auto repeat_count = zero;
 
-    llvm::SmallVector<int64_t, 3> strides = op.getStridesInAddressGranularity();
+    llvm::SmallVector<int64_t, 4> strides = op.getStridesInAddressGranularity();
     llvm::SmallVector<int64_t, 4> sizes = op.getSizesInAddressGranularity();
     int64_t offset = op.getOffsetInBytes();
 
@@ -260,33 +260,34 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
     // packet_type
 
     // d0_size
-    if (strides[0])
+    if (strides[1])
       d0_size = IntegerAttr::get(i32ty, sizes[0]);
 
     // d0_stride
-    d0_stride = IntegerAttr::get(i32ty, 0);
+    if (strides[0])
+      d0_stride = IntegerAttr::get(i32ty, strides[0] - 1);
 
     // d1_size
-    if (strides[1])
+    if (strides[2])
       d1_size = IntegerAttr::get(i32ty, sizes[1]);
 
     // d1_stride
-    if (strides[0])
-      d1_stride = IntegerAttr::get(i32ty, strides[0] - 1);
+    if (strides[1])
+      d1_stride = IntegerAttr::get(i32ty, strides[1] - 1);
 
     // d2_stride
-    if (strides[1])
-      d2_stride = IntegerAttr::get(i32ty, strides[1] - 1);
+    if (strides[2])
+      d2_stride = IntegerAttr::get(i32ty, strides[2] - 1);
 
     // iteration_current
 
     // iteration_size
-    if (strides[2])
+    if (strides[3])
       iteration_size = IntegerAttr::get(i32ty, sizes[3] - 1);
 
     // iteration_stride
-    if (strides[2])
-      iteration_stride = IntegerAttr::get(i32ty, strides[2] - 1);
+    if (strides[3])
+      iteration_stride = IntegerAttr::get(i32ty, strides[3] - 1);
 
     // next_bd
 

diff --git a/lib/Targets/AIETargetHSA.cpp b/lib/Targets/AIETargetHSA.cpp
@@ -135,7 +135,7 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
     bool isMM2S = channelDir == AIE::DMAChannelDir::MM2S;
     int col = infoOp->getCol();
 
-    llvm::SmallVector<int64_t, 3> strides = llvm::map_to_vector(
+    llvm::SmallVector<int64_t, 4> strides = llvm::map_to_vector(
         llvm::reverse(op.getMixedStrides()),
         [](OpFoldResult s) { return getConstantIntValue(s).value(); });
     ::SmallVector<int64_t, 4> sizes = llvm::map_to_vector(
@@ -171,6 +171,10 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
       }
     }
 
+    if (strides[0] != 1)
+      return module.emitOpError("nd_memcpy inner-dimension stride != 1 is "
+                                "unsupported by HSA target");
+
     // Writing the packet information to perform the DMA
     output << "\thsa_agent_dispatch_packet_t pkt" << op_count << " ;\n";
     output << "\twr_idx  = hsa_queue_add_write_index_relaxed(q, 1);\n";
@@ -181,10 +185,10 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
            << "/* channel */, 4 /* Burst length */, 2 /* Memory space */, "
               "(uint64_t)buf"
            << arg_idx << " + " << offset << " /* Address */, " << sizes[0] * 4
-           << " /* 1d_length */, " << (strides[0] ? sizes[1] : 1)
-           << " /* 2d_length */, " << (strides[0] ? strides[0] * 4 : 0)
-           << " /* 2d_stride */, " << (strides[1] ? sizes[2] : 1)
-           << " /* 3d_length */, " << (strides[1] ? strides[1] * 4 : 0)
+           << " /* 1d_length */, " << (strides[1] ? sizes[1] : 1)
+           << " /* 2d_length */, " << (strides[1] ? strides[1] * 4 : 0)
+           << " /* 2d_stride */, " << (strides[2] ? sizes[2] : 1)
+           << " /* 3d_length */, " << (strides[2] ? strides[2] * 4 : 0)
            << " /* 3d_stride */ , 1 /* 4d_length */, 0 /* 4d_stride */);\n";
 
     bool last_op = op_count == (num_ops - 1);

diff --git a/programming_examples/basic/dma_transpose/aie2.py b/programming_examples/basic/dma_transpose/aie2.py
@@ -57,7 +57,11 @@ def sequence(A, B, C):
                 # The strides below are configured to read across all rows in the same column
                 # Stride of K in dim/wrap 2 skips an entire row to read a full column
                 npu_dma_memcpy_nd(
-                    metadata="in", bd_id=1, mem=A, sizes=[1, K, M, 1], strides=[1, 1, K]
+                    metadata="in",
+                    bd_id=1,
+                    mem=A,
+                    sizes=[1, K, M, 1],
+                    strides=[1, 1, K, 1],
                 )
                 npu_sync(column=0, row=0, direction=0, channel=0)
 

diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
@@ -196,7 +196,7 @@ def sequence(A, B, C):
                     bd_id=2,
                     mem=B,
                     sizes=[M_div_m_div_n_cores, 1, 1, K],
-                    strides=[0, 0, 0],
+                    strides=[0, 0, 0, 1],
                 )
                 for i in range(n_cores):
                     A_offset = i * M_div_m_div_n_cores * m * K
@@ -207,15 +207,15 @@ def sequence(A, B, C):
                         mem=A,
                         offsets=[0, 0, 0, A_offset],
                         sizes=[M_div_m_div_n_cores, K_div_k, m, k],
-                        strides=[m_x_K, k, K],
+                        strides=[m_x_K, k, K, 1],
                     )
                     npu_dma_memcpy_nd(
                         metadata=outC_fifo_names[i],
                         bd_id=0,
                         mem=C,
                         offsets=[0, 0, 0, C_offset],
                         sizes=[1, 1, 1, C_sz_div_n_cores],
-                        strides=[0, 0, 0],
+                        strides=[0, 0, 0, 1],
                     )
 
                 for i in range(n_cores):

diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -226,7 +226,7 @@ def sequence(A, B, C):
                         mem=C,
                         offsets=[0, 0, 0, C_row_offset],
                         sizes=[num_tile_rows, N_div_n, m, n],
-                        strides=[m_x_N, n, N],
+                        strides=[m_x_N, n, N, 1],
                     )
                     for tile_row in range(num_tile_rows):
                         A_row_offset = (
@@ -238,14 +238,14 @@ def sequence(A, B, C):
                             mem=A,
                             offsets=[0, 0, 0, A_row_offset],
                             sizes=[N_div_n, K_div_k, m, k],
-                            strides=[0, k, K],
+                            strides=[0, k, K, 1],
                         )
                         npu_dma_memcpy_nd(
                             metadata="inB",
                             bd_id=2 * tile_row + 2,
                             mem=B,
                             sizes=[N_div_n, K_div_k, k, n],
-                            strides=[n, k_x_N, N],
+                            strides=[n, k_x_N, N, 1],
                         )
 
                     npu_sync(column=0, row=0, direction=0, channel=0)

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -324,7 +324,7 @@ def sequence(A, B, C):
                             mem=C,
                             offsets=[0, 0, 0, C_offset],
                             sizes=[num_tile_rows, N // n // n_cols, m * n_rows, n],
-                            strides=[m * n_rows * N, n * n_cols, N],
+                            strides=[m * n_rows * N, n * n_cols, N, 1],
                         )
                         for tile_row in range(num_tile_rows):
                             A_row_offset = (
@@ -342,15 +342,15 @@ def sequence(A, B, C):
                                 mem=A,
                                 offsets=[0, 0, 0, A_offset],
                                 sizes=[N // n // n_cols, K // k, m, k],
-                                strides=[0, k, K],
+                                strides=[0, k, K, 1],
                             )
                             npu_dma_memcpy_nd(
                                 metadata=inB_fifo_names[i],
                                 bd_id=2 * tile_row + 2,
                                 mem=B,
                                 offsets=[0, 0, 0, B_col_offset],
                                 sizes=[N // n // n_cols, K // k, k, n],
-                                strides=[n * n_cols, k * N, N],
+                                strides=[n * n_cols, k * N, N, 1],
                             )
                     for i in range(n_cols):
                         npu_sync(column=i, row=0, direction=0, channel=0)

diff --git a/programming_examples/basic/matrix_scalar_add/aie2.py b/programming_examples/basic/matrix_scalar_add/aie2.py
@@ -85,14 +85,14 @@ def sequence(inTensor, notUsed, outTensor):
                 bd_id=0,
                 mem=outTensor,
                 sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-                strides=[1, 1, IMAGE_WIDTH],
+                strides=[1, 1, IMAGE_WIDTH, 1],
             )
             npu_dma_memcpy_nd(
                 metadata="in0",
                 bd_id=1,
                 mem=inTensor,
                 sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-                strides=[1, 1, IMAGE_WIDTH],
+                strides=[1, 1, IMAGE_WIDTH, 1],
             )
             npu_sync(column=0, row=0, direction=0, channel=0)
 

diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
@@ -999,11 +999,11 @@ aie.device(npu1_3col) {
       %total_wts_3_off = arith.constant  35840 : i64
 
       //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
-      aiex.npu.dma_memcpy_nd(0, 0, %in0[0, 0, 0, 0][1, 1, 1, %act_in][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %out[0, 0, 0, 0][1, 1, 1, %act_out][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, 0][1, 1, 1, %total_wts][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<53248xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts][1, 1, 1, %total_wts_2][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_1_L3L2} : memref<53248xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %in0[0, 0, 0, 0][1, 1, 1, %act_in][0, 0, 0, 1]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %out[0, 0, 0, 0][1, 1, 1, %act_out][0, 0, 0, 1]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, 0][1, 1, 1, %total_wts][0, 0, 0, 1]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<53248xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts][1, 1, 1, %total_wts_2][0, 0, 0, 1]) {id = 1 : i64, metadata = @inOF_wts_1_L3L2} : memref<53248xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0, 1]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32>
 
       aiex.npu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return

diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
@@ -53,8 +53,8 @@ module @passThroughLine_aie2 {
             %tilewidth  = arith.constant 480 : i64  // in 32b words so tileWidth/4
 
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
-            aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32>
-            aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32>
+            aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth, %c1]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32>
+            aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth, %c1]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32>
             aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }

diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
@@ -54,8 +54,8 @@ module @passThroughLine_aie2 {
             %totalLenRGBA = arith.constant 2073600 : i64
 
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
-            aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32>
-            aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32>
+            aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0, %c1]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32>
+            aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0, %c1]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32>
             aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }