From 0504f7ad3662ce9a2044ea38f3a7f3df9b6708fc Mon Sep 17 00:00:00 2001
From: pvasireddy-amd <pvasired@amd.com>
Date: Tue, 11 Jun 2024 13:18:48 -0600
Subject: [PATCH] Changes to NpuDmaMemcpyNdOp and AIEDmaToNpu to support
 sub-word strides, offsets and sizes (#1538)

Co-authored-by: Joseph Melber <jgmelber@gmail.com>
---
 include/aie-c/TargetModel.h                   |  4 ++
 include/aie/Dialect/AIE/IR/AIETargetModel.h   |  9 +++
 lib/CAPI/TargetModel.cpp                      |  4 ++
 lib/Dialect/AIEX/IR/AIEXDialect.cpp           | 12 +++-
 lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp   | 16 ++++++
 .../matrix_vector/aie2.py                     | 36 +++++-------
 .../matrix_multiplication/single_core/aie2.py | 56 +++++++------------
 .../basic/passthrough_kernel/aie2.py          | 11 ++--
 programming_examples/basic/vector_exp/aie2.py | 15 +----
 .../basic/vector_scalar_mul/aie2.py           | 12 ++--
 programming_examples/ml/bottleneck/aie2.py    | 20 +++----
 programming_examples/ml/conv2d/aie2.py        | 14 ++---
 .../ml/conv2d_fused_relu/aie2.py              | 14 ++---
 programming_examples/ml/eltwise_add/aie2.py   | 18 ++----
 programming_examples/ml/eltwise_mul/aie2.py   | 18 ++----
 programming_examples/ml/relu/aie2.py          | 13 +----
 .../ml/resnet/layers_conv2_x/aie2.py          | 42 +++++++-------
 programming_examples/ml/softmax/aie2.py       | 13 +----
 .../vision/color_detect/aie2_colorDetect.py   | 11 ++--
 .../color_threshold/aie2_colorThreshold.py    |  9 ++-
 .../vision/edge_detect/aie2_edgeDetect.py     |  8 +--
 .../vision/vision_passthrough/aie2.py         |  8 +--
 test/Conversion/DmaToNpu/bad_dma_to_npu.mlir  | 29 ++++++++++
 .../DmaToNpu/bad_dma_to_npu_datatype.mlir     | 29 ++++++++++
 .../DmaToNpu/dma_to_npu_width_conversion.mlir | 40 +++++++++++++
 test/dialect/AIEX/bad_npu_nd.mlir             | 14 -----
 26 files changed, 254 insertions(+), 221 deletions(-)
 create mode 100644 test/Conversion/DmaToNpu/bad_dma_to_npu.mlir
 create mode 100644 test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir
 create mode 100644 test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir
diff --git a/include/aie-c/TargetModel.h b/include/aie-c/TargetModel.h
index c2c026fd37..59da2f2bf9 100644
--- a/include/aie-c/TargetModel.h
+++ b/include/aie-c/TargetModel.h
@@ -42,6 +42,10 @@ DEFINE_C_API_STRUCT(AieTargetModel, uint64_t);
 
 MLIR_CAPI_EXPORTED AieTargetModel aieGetTargetModel(uint32_t device);
 
+/// Returns the address-generation granularity (data bus width), in bits.
+MLIR_CAPI_EXPORTED uint32_t
+aieGetTargetModelAddressGenGranularity(AieTargetModel targetModel);
+
 /// Returns the number of columns in the target model.
 MLIR_CAPI_EXPORTED int aieTargetModelColumns(AieTargetModel targetModel);
 
diff --git a/include/aie/Dialect/AIE/IR/AIETargetModel.h b/include/aie/Dialect/AIE/IR/AIETargetModel.h
index e9f5de7680..a6ec03d230 100644
--- a/include/aie/Dialect/AIE/IR/AIETargetModel.h
+++ b/include/aie/Dialect/AIE/IR/AIETargetModel.h
@@ -61,6 +61,9 @@ class AIETargetModel {
   /// Return the target architecture.
   virtual AIEArch getTargetArch() const = 0;
 
+  /// Return the address-generation granularity (data bus width), in bits.
+  virtual uint32_t getAddressGenGranularity() const = 0;
+
   /// Return the number of columns in the device.
   virtual int columns() const = 0;
 
@@ -293,6 +296,8 @@ class AIE2TargetModel : public AIETargetModel {
 
   AIEArch getTargetArch() const override;
 
+  uint32_t getAddressGenGranularity() const override { return 32; }
+
   std::optional<TileID> getMemWest(TileID src) const override;
   std::optional<TileID> getMemEast(TileID src) const override;
   std::optional<TileID> getMemNorth(TileID src) const override;
@@ -352,6 +357,8 @@ class VC1902TargetModel : public AIE1TargetModel {
 public:
   VC1902TargetModel() = default;
 
+  uint32_t getAddressGenGranularity() const override { return 32; }
+
   int columns() const override { return 50; }
 
   int rows() const override { return 9; /* One Shim row and 8 Core rows. */ }
@@ -532,6 +539,8 @@ class VirtualizedNPUTargetModel : public BaseNPUTargetModel {
 public:
   VirtualizedNPUTargetModel(int _cols) : cols(_cols) {}
 
+  uint32_t getAddressGenGranularity() const override { return 32; }
+
   int columns() const override { return cols; }
 
   bool isShimNOCTile(int col, int row) const override { return row == 0; }
diff --git a/lib/CAPI/TargetModel.cpp b/lib/CAPI/TargetModel.cpp
index 9c41828871..bd5b33bd6e 100644
--- a/lib/CAPI/TargetModel.cpp
+++ b/lib/CAPI/TargetModel.cpp
@@ -28,6 +28,10 @@ AieTargetModel aieGetTargetModel(uint32_t device) {
       xilinx::AIE::getTargetModel(static_cast<xilinx::AIE::AIEDevice>(device)));
 }
 
+uint32_t aieGetTargetModelAddressGenGranularity(AieTargetModel targetModel) {
+  return unwrap(targetModel).getAddressGenGranularity();
+}
+
 int aieTargetModelColumns(AieTargetModel targetModel) {
   return unwrap(targetModel).columns();
 }
diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index 57d5a13c2e..e1102b4fe3 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -66,8 +66,16 @@ LogicalResult AIEX::BroadcastPacketOp::verify() {
 
 LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
   MemRefType buffer = getMemref().getType();
-  if (buffer.getElementTypeBitWidth() != 32)
-    return emitOpError("must be used with memref type with element width 32.");
+  const auto &targetModel = AIE::getTargetModel(*this);
+  auto addressGranularity = targetModel.getAddressGenGranularity();
+  if (buffer.getElementTypeBitWidth() > addressGranularity) {
+    return emitOpError("Maximum element bit width allowed is ")
+           << addressGranularity << "bits. ";
+  } else if ((buffer.getNumElements() * buffer.getElementTypeBitWidth()) <
+             addressGranularity) {
+    return emitOpError("Minimum data transfer size required is ")
+           << addressGranularity << "bits. ";
+  }
   if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) {
         return getConstantIntValue(s).has_value();
       }))
diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
index e5b1332916..95f090776c 100644
--- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
+++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
@@ -230,6 +230,22 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
         llvm::reverse(op.getMixedOffsets()),
         [](OpFoldResult s) { return getConstantIntValue(s).value(); });
 
+    MemRefType buffer = op.getMemref().getType();
+    const auto &targetModel = AIE::getTargetModel(op);
+    auto elemWidth = buffer.getElementTypeBitWidth();
+    auto addressGranularity = targetModel.getAddressGenGranularity();
+    if (elemWidth < addressGranularity) {
+      if (!strides.empty()) {
+        for (int i = 0; i < 3; i++) {
+          strides[i] = (strides[i] * elemWidth) / addressGranularity;
+        }
+      }
+      if (!sizes.empty())
+        sizes[0] = (sizes[0] * elemWidth) / addressGranularity;
+      if (!offsets.empty())
+        offsets[0] = (offsets[0] * elemWidth) / addressGranularity;
+    }
+
     // column
     column = IntegerAttr::get(i32ty, col);
 
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
index 6b27d9f9e3..54276121c8 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
@@ -17,26 +17,20 @@ def my_matmul():
     K = 288
     m = 32
     k = 32
-    word_size_in = 2
-    word_size_out = 4
 
     n_cores = 1
 
-    A_sz_in_i32s = M * K * word_size_in // 4
-    B_sz_in_i32s = K * word_size_in // 4
-    C_sz_in_bytes = M * word_size_out
-    C_sz_in_i32s = C_sz_in_bytes // 4
-    C_sz_div_n_cores_in_i32s = C_sz_in_i32s // n_cores
+    A_sz = M * K
+    B_sz = K
+    C_sz = M
+    C_sz_div_n_cores = C_sz // n_cores
 
     M_div_m = M // m
     M_div_m_div_n_cores = M // (m * n_cores)
     K_div_k = K // k
 
-    K_in_i32s = K * word_size_in // 4
-    k_in_i32s = k * word_size_in // 4
-    m_in_i32s = m * word_size_in // 4
-    m_x_k_in_i32s = m * k * word_size_in // 4
-    m_x_K_in_i32s = m * K * word_size_in // 4
+    m_x_k = m * k
+    m_x_K = m * K
 
     vectorized = True
 
@@ -172,35 +166,35 @@ def core_body():
             # To/from AIE-array data movement
 
             @FuncOp.from_py_func(
-                T.memref(A_sz_in_i32s, T.i32()),
-                T.memref(B_sz_in_i32s, T.i32()),
-                T.memref(C_sz_in_i32s, T.i32()),
+                T.memref(A_sz, T.bf16()),
+                T.memref(B_sz, T.bf16()),
+                T.memref(C_sz, T.f32()),
             )
             def sequence(A, B, C):
                 npu_dma_memcpy_nd(
                     metadata=inB_fifo_names[0],
                     bd_id=2,
                     mem=B,
-                    sizes=[M_div_m_div_n_cores, 1, 1, K_in_i32s],
+                    sizes=[M_div_m_div_n_cores, 1, 1, K],
                     strides=[0, 0, 0],
                 )
                 for i in range(n_cores):
-                    A_offset = i * M_div_m_div_n_cores * m * K * word_size_in // 4
-                    C_offset = i * M_div_m_div_n_cores * m * word_size_out // 4
+                    A_offset = i * M_div_m_div_n_cores * m * K
+                    C_offset = i * M_div_m_div_n_cores * m
                     npu_dma_memcpy_nd(
                         metadata=memA_fifo_names[i],
                         bd_id=1,
                         mem=A,
                         offsets=[0, 0, 0, A_offset],
-                        sizes=[M_div_m_div_n_cores, K_div_k, m, k_in_i32s],
-                        strides=[m_x_K_in_i32s, k_in_i32s, K_in_i32s],
+                        sizes=[M_div_m_div_n_cores, K_div_k, m, k],
+                        strides=[m_x_K, k, K],
                     )
                     npu_dma_memcpy_nd(
                         metadata=outC_fifo_names[i],
                         bd_id=0,
                         mem=C,
                         offsets=[0, 0, 0, C_offset],
-                        sizes=[1, 1, 1, C_sz_div_n_cores_in_i32s],
+                        sizes=[1, 1, 1, C_sz_div_n_cores],
                         strides=[0, 0, 0],
                     )
 
diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index 909fba0c43..ba312aa417 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -22,36 +22,26 @@ def my_matmul():
     r = 4
     s = 8
     t = 4
-    word_size_in = 2
-    word_size_out = 2
 
     vectorized = True
     enable_tracing = False
     trace_size = 65536
 
-    A_sz_in_i32s = M * K * word_size_in // 4
-    B_sz_in_i32s = K * N * word_size_in // 4
-    C_sz_in_bytes = M * N * word_size_out
-    C_sz_in_i32s = C_sz_in_bytes // 4
+    A_sz = M * K
+    B_sz = K * N
+    C_sz = M * N
+    C_sz_in_bytes = C_sz * 2
 
     M_div_m = M // m
     K_div_k = K // k
     N_div_n = N // n
     tiles = M_div_m * N_div_n
 
-    # Matrix A: MxK, submatrices a: mxk
-    k_in_i32s = k * word_size_in // 4
-    K_in_i32s = K * word_size_in // 4
-
     # Matrix B: KxN, submatrices b: kxn
-    n_in_i32s = n * word_size_in // 4
-    N_in_i32s = N * word_size_in // 4
-    k_x_N_in_i32s = k * N * word_size_in // 4
+    k_x_N = k * N
 
     # Output Matrix C: MxN
-    n_in_i32s_out = n * word_size_out // 4
-    N_in_i32s_out = N * word_size_out // 4
-    m_x_N_in_i32s_out = m * N * word_size_out // 4
+    m_x_N = m * N
 
     with mlir_mod_ctx() as ctx:
 
@@ -169,9 +159,9 @@ def core_body():
             # To/from AIE-array data movement
 
             @FuncOp.from_py_func(
-                T.memref(A_sz_in_i32s, T.i32()),
-                T.memref(B_sz_in_i32s, T.i32()),
-                T.memref(C_sz_in_i32s, T.i32()),
+                T.memref(A_sz, T.bf16()),
+                T.memref(B_sz, T.bf16()),
+                T.memref(C_sz, T.bf16()),
             )
             def sequence(A, B, C):
 
@@ -189,9 +179,7 @@ def sequence(A, B, C):
                 for tile_row_block in range(
                     (M_div_m + rows_per_block - 1) // rows_per_block
                 ):
-                    C_row_offset_in_i32s = (
-                        tile_row_block * rows_per_block * m * N * word_size_out // 4
-                    )
+                    C_row_offset = tile_row_block * rows_per_block * m * N
                     num_tile_rows = min(
                         [rows_per_block, M_div_m - tile_row_block * rows_per_block]
                     )
@@ -199,32 +187,28 @@ def sequence(A, B, C):
                         metadata="outC",
                         bd_id=0,
                         mem=C,
-                        offsets=[0, 0, 0, C_row_offset_in_i32s],
-                        sizes=[num_tile_rows, N_div_n, m, n_in_i32s_out],
-                        strides=[m_x_N_in_i32s_out, n_in_i32s_out, N_in_i32s_out],
+                        offsets=[0, 0, 0, C_row_offset],
+                        sizes=[num_tile_rows, N_div_n, m, n],
+                        strides=[m_x_N, n, N],
                     )
                     for tile_row in range(num_tile_rows):
-                        A_row_offset_in_i32s = (
-                            ((tile_row_block * rows_per_block) + tile_row)
-                            * m
-                            * K
-                            * word_size_in
-                            // 4
+                        A_row_offset = (
+                            ((tile_row_block * rows_per_block) + tile_row) * m * K
                         )
                         npu_dma_memcpy_nd(
                             metadata="inA",
                             bd_id=2 * tile_row + 1,
                             mem=A,
-                            offsets=[0, 0, 0, A_row_offset_in_i32s],
-                            sizes=[N_div_n, K_div_k, m, k_in_i32s],
-                            strides=[0, k_in_i32s, K_in_i32s],
+                            offsets=[0, 0, 0, A_row_offset],
+                            sizes=[N_div_n, K_div_k, m, k],
+                            strides=[0, k, K],
                         )
                         npu_dma_memcpy_nd(
                             metadata="inB",
                             bd_id=2 * tile_row + 2,
                             mem=B,
-                            sizes=[N_div_n, K_div_k, k, n_in_i32s],
-                            strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s],
+                            sizes=[N_div_n, K_div_k, k, n],
+                            strides=[n, k_x_N, N],
                         )
 
                     npu_sync(column=0, row=0, direction=0, channel=0)
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index 4fe9a7ed9b..fcd6c84632 100644
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -19,7 +19,6 @@
 def passthroughKernel(vector_size, trace_size):
     N = vector_size
     lineWidthInBytes = N // 4  # chop input in 4 sub-tensors
-    lineWidthInInt32s = lineWidthInBytes // 4
 
     @device(AIEDevice.npu1_1col)
     def device_body():
@@ -58,9 +57,7 @@ def core_body():
 
         #    print(ctx.module.operation.verify())
 
-        tensorSize = N
-        tensorSizeInInt32s = tensorSize // 4
-        tensor_ty = T.memref(tensorSizeInInt32s, T.i32())
+        tensor_ty = T.memref(N, T.ui8())
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
         def sequence(inTensor, outTensor, notUsed):
@@ -70,20 +67,20 @@ def sequence(inTensor, outTensor, notUsed):
                     ShimTile,
                     ddr_id=1,
                     size=trace_size,
-                    offset=tensorSize,
+                    offset=N,
                 )
 
             npu_dma_memcpy_nd(
                 metadata="in",
                 bd_id=0,
                 mem=inTensor,
-                sizes=[1, 1, 1, tensorSizeInInt32s],
+                sizes=[1, 1, 1, N],
             )
             npu_dma_memcpy_nd(
                 metadata="out",
                 bd_id=1,
                 mem=outTensor,
-                sizes=[1, 1, 1, tensorSizeInInt32s],
+                sizes=[1, 1, 1, N],
             )
             npu_sync(column=0, row=0, direction=0, channel=0)
 
diff --git a/programming_examples/basic/vector_exp/aie2.py b/programming_examples/basic/vector_exp/aie2.py
index af58a6392b..87c8f33c31 100644
--- a/programming_examples/basic/vector_exp/aie2.py
+++ b/programming_examples/basic/vector_exp/aie2.py
@@ -17,12 +17,7 @@
 # AI Engine structural design function
 def my_eltwise_exp():
 
-    word_size_in = 2
     N = 65536
-    N_in_bytes = N * word_size_in
-
-    A_sz_in_i32s = N_in_bytes // 4
-    C_sz_in_i32s = N_in_bytes // 4
 
     # Tile sizes
     n = 1024
@@ -103,16 +98,12 @@ def core_body():
                     yield_([])
 
         # To/from AIE-array data movement
-        tensor_ty = T.memref(N, T.i32())
+        tensor_ty = T.memref(N, T.bf16())
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty)
         def sequence(A, C):
-            npu_dma_memcpy_nd(
-                metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
-            )
-            npu_dma_memcpy_nd(
-                metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
-            )
+            npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_sync(column=0, row=0, direction=0, channel=0)
 
 
diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py
index 8d367ced50..b0a957393b 100644
--- a/programming_examples/basic/vector_scalar_mul/aie2.py
+++ b/programming_examples/basic/vector_scalar_mul/aie2.py
@@ -17,10 +17,8 @@
 
 
 def my_vector_scalar(vector_size, trace_size):
-    word_size_in = 2
     N = vector_size
-    N_in_i32s = N * word_size_in // 4
-    N_in_bytes = N_in_i32s * 4
+    N_in_bytes = N * 2
     N_div_n = 4  # chop input vector into 4 sub-vectors
     n = N // N_div_n
 
@@ -82,7 +80,7 @@ def core_body():
                 yield_([])
 
         # To/from AIE-array data movement
-        tensor_ty = T.memref(N_in_i32s, T.i32())
+        tensor_ty = T.memref(N, T.i16())
         scalar_ty = T.memref(1, T.i32())
 
         @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty)
@@ -96,10 +94,8 @@ def sequence(A, F, C):
                     size=trace_size,
                     offset=N_in_bytes,
                 )
-            npu_dma_memcpy_nd(
-                metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N_in_i32s]
-            )
-            npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s])
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
             npu_sync(column=0, row=0, direction=0, channel=0)
 
diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py
index ebdaf0d1b0..9ee60a3b62 100644
--- a/programming_examples/ml/bottleneck/aie2.py
+++ b/programming_examples/ml/bottleneck/aie2.py
@@ -501,16 +501,16 @@ def core_body():
                     yield_([])
 
             # instruction stream generation
-            activationsInSize32b = (tensorInW * tensorInH * tensorInC) // 4
-            acitivationsOutSize32b = activationsInSize32b
-            totalWeightsSize32b = (
+            activationsIn = tensorInW * tensorInH * tensorInC
+            acitivationsOut = activationsIn
+            totalWeights = (
                 tensorL1InC * tensorL1OutC
                 + 3 * 3 * tensorL2InC * tensorL2OutC
                 + tensorL3InC * tensorL3OutC
-            ) // 4
+            )
 
-            activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty)
-            weightsInL3_ty = MemRefType.get((totalWeightsSize32b,), int32_ty)
+            activationsInL3_ty = MemRefType.get((activationsIn,), int8_ty)
+            weightsInL3_ty = MemRefType.get((totalWeights,), uint8_ty)
 
             @FuncOp.from_py_func(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty)
             def sequence(inputFromL3, weightsFromL3, outputToL3):
@@ -568,7 +568,7 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     npu_writebd(
                         bd_id=3,
                         buffer_length=trace_sz_in_i32s,
-                        buffer_offset=acitivationsOutSize32b,
+                        buffer_offset=acitivationsOut,
                         enable_packet=0,
                         out_of_order_id=0,
                         packet_id=0,
@@ -616,19 +616,19 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     metadata="inOF_act_L3L2",
                     bd_id=0,
                     mem=inputFromL3,
-                    sizes=[1, 1, 1, activationsInSize32b],
+                    sizes=[1, 1, 1, activationsIn],
                 )
                 npu_dma_memcpy_nd(
                     metadata="outOFL2L3",
                     bd_id=2,
                     mem=outputToL3,
-                    sizes=[1, 1, 1, acitivationsOutSize32b],
+                    sizes=[1, 1, 1, acitivationsOut],
                 )
                 npu_dma_memcpy_nd(
                     metadata="inOF_wts_0_L3L2",
                     bd_id=1,
                     mem=weightsFromL3,
-                    sizes=[1, 1, 1, totalWeightsSize32b],
+                    sizes=[1, 1, 1, totalWeights],
                 )
 
                 npu_sync(column=0, row=0, direction=0, channel=0)
diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py
index 11e92f55c2..b6fb537a26 100644
--- a/programming_examples/ml/conv2d/aie2.py
+++ b/programming_examples/ml/conv2d/aie2.py
@@ -25,14 +25,11 @@
 
 actIn = width * in_channels  # 32*64 = 2048
 bufIn = actIn * 2  # double buffer
-actInInt32s = actIn // 4
 
 weights = in_channels * out_channels
-weightsInInt32s = weights // 4
 
 actOut = width * out_channels  # 32*64 = 2048
 bufOut = actOut * 2  # double buffer
-actOutInt32s = actOut // 4
 
 
 def conv2dk1():
@@ -141,9 +138,8 @@ def core_body():
             # To/from AIE-array data movement
 
             tensorSize = width * height * in_channels
-            tensorSizeInInt32s = tensorSize // 4
-            tensor_ty = T.memref(tensorSizeInInt32s, T.i32())
-            memRef_wts_ty = T.memref(weightsInInt32s, T.i32())
+            tensor_ty = T.memref(tensorSize, T.i8())
+            memRef_wts_ty = T.memref(weights, T.i8())
             # memRef_16x16_ty = T.memref(16, 16, T.i32())
 
             @FuncOp.from_py_func(tensor_ty, memRef_wts_ty, tensor_ty)
@@ -154,19 +150,19 @@ def sequence(I, W, O):
                     metadata="inOF_act_L3L2",
                     bd_id=0,
                     mem=I,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_dma_memcpy_nd(
                     metadata="outOFL2L3",
                     bd_id=2,
                     mem=O,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_dma_memcpy_nd(
                     metadata="inOF_wts_0_L3L2",
                     bd_id=2,
                     mem=W,
-                    sizes=[1, 1, 1, weightsInInt32s],
+                    sizes=[1, 1, 1, weights],
                 )
                 npu_sync(column=0, row=0, direction=0, channel=0)
 
diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py
index 98ff6fe624..efd1b13555 100644
--- a/programming_examples/ml/conv2d_fused_relu/aie2.py
+++ b/programming_examples/ml/conv2d_fused_relu/aie2.py
@@ -25,14 +25,11 @@
 
 actIn = width * in_channels  # 32*64 = 2048
 bufIn = actIn * 2  # double buffer
-actInInt32s = actIn // 4
 
 weights = in_channels * out_channels
-weightsInInt32s = weights // 4
 
 actOut = width * out_channels  # 32*64 = 2048
 bufOut = actOut * 2  # double buffer
-actOutInt32s = actOut // 4
 
 enableTrace = False
 trace_size = 16384
@@ -148,9 +145,8 @@ def core_body():
             # To/from AIE-array data movement
 
             tensorSize = width * height * in_channels
-            tensorSizeInInt32s = tensorSize // 4
-            tensor_ty = T.memref(tensorSizeInInt32s, T.i32())
-            memRef_wts_ty = T.memref(weightsInInt32s, T.i32())
+            tensor_ty = T.memref(tensorSize, T.i8())
+            memRef_wts_ty = T.memref(weights, T.i8())
             # memRef_16x16_ty = T.memref(16, 16, T.i32())
 
             @FuncOp.from_py_func(tensor_ty, memRef_wts_ty, tensor_ty)
@@ -240,19 +236,19 @@ def sequence(I, W, O):
                     metadata="inOF_act_L3L2",
                     bd_id=0,
                     mem=I,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_dma_memcpy_nd(
                     metadata="outOFL2L3",
                     bd_id=2,
                     mem=O,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_dma_memcpy_nd(
                     metadata="inOF_wts_0_L3L2",
                     bd_id=2,
                     mem=W,
-                    sizes=[1, 1, 1, weightsInInt32s],
+                    sizes=[1, 1, 1, weights],
                 )
                 npu_sync(column=0, row=0, direction=0, channel=0)
 
diff --git a/programming_examples/ml/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py
index 354e9f78d1..4d0716fa1c 100644
--- a/programming_examples/ml/eltwise_add/aie2.py
+++ b/programming_examples/ml/eltwise_add/aie2.py
@@ -21,10 +21,6 @@ def my_eltwise_add(trace_size):
     N = 65536
     N_in_bytes = N * word_size_in
 
-    A_sz_in_i32s = N_in_bytes // 4
-    B_sz_in_i32s = N_in_bytes // 4
-    C_sz_in_i32s = N_in_bytes // 4
-
     # Tile sizes
     n = 1024
     N_div_n = N // n
@@ -129,7 +125,7 @@ def core_body():
                     yield_([])
 
         # To/from AIE-array data movement
-        tensor_ty = T.memref(N, T.i32())
+        tensor_ty = T.memref(N, T.bf16())
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
         def sequence(A, B, C):
@@ -143,15 +139,9 @@ def sequence(A, B, C):
                     offset=N_in_bytes,
                 )
 
-            npu_dma_memcpy_nd(
-                metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
-            )
-            npu_dma_memcpy_nd(
-                metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
-            )
-            npu_dma_memcpy_nd(
-                metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s]
-            )
+            npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, N])
             npu_sync(column=0, row=0, direction=0, channel=0)
 
 
diff --git a/programming_examples/ml/eltwise_mul/aie2.py b/programming_examples/ml/eltwise_mul/aie2.py
index 5808d0c998..4966ecd06e 100644
--- a/programming_examples/ml/eltwise_mul/aie2.py
+++ b/programming_examples/ml/eltwise_mul/aie2.py
@@ -21,10 +21,6 @@ def my_eltwise_mul(trace_size):
     N = 65536
     N_in_bytes = N * word_size_in
 
-    A_sz_in_i32s = N_in_bytes // 4
-    B_sz_in_i32s = N_in_bytes // 4
-    C_sz_in_i32s = N_in_bytes // 4
-
     # Tile sizes
     n = 1024
     N_div_n = N // n
@@ -130,7 +126,7 @@ def core_body():
                     yield_([])
 
         # To/from AIE-array data movement
-        tensor_ty = T.memref(N, T.i32())
+        tensor_ty = T.memref(N, T.bf16())
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
         def sequence(A, B, C):
@@ -144,15 +140,9 @@ def sequence(A, B, C):
                     offset=N_in_bytes,
                 )
 
-            npu_dma_memcpy_nd(
-                metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
-            )
-            npu_dma_memcpy_nd(
-                metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
-            )
-            npu_dma_memcpy_nd(
-                metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s]
-            )
+            npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, N])
             npu_sync(column=0, row=0, direction=0, channel=0)
 
 
diff --git a/programming_examples/ml/relu/aie2.py b/programming_examples/ml/relu/aie2.py
index e4da4eafdf..2d62135f27 100644
--- a/programming_examples/ml/relu/aie2.py
+++ b/programming_examples/ml/relu/aie2.py
@@ -21,9 +21,6 @@ def my_relu(trace_size):
     N = 65536
     N_in_bytes = N * word_size_in
 
-    A_sz_in_i32s = N_in_bytes // 4
-    C_sz_in_i32s = N_in_bytes // 4
-
     # Tile sizes
     n = 1024
     N_div_n = N // n
@@ -105,7 +102,7 @@ def core_body():
                     yield_([])
 
         # To/from AIE-array data movement
-        tensor_ty = T.memref(N, T.i32())
+        tensor_ty = T.memref(N, T.bf16())
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty)
         def sequence(A, C):
@@ -118,12 +115,8 @@ def sequence(A, C):
                     size=trace_size,
                     offset=N_in_bytes,
                 )
-            npu_dma_memcpy_nd(
-                metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
-            )
-            npu_dma_memcpy_nd(
-                metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
-            )
+            npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_sync(column=0, row=0, direction=0, channel=0)
 
 
diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
index 94f5888512..729ed2b0fb 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py
+++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
@@ -894,33 +894,29 @@ def core_body():
                         yield_([])
 
             # instruction stream generation
-            activationsInSize32b = (tensorInW * tensorInH * tensorInCInit) // 4
-            acitivationsOutSize32b = (tensorInW * tensorInH * tensorInCRest) // 4
+            activationsIn = tensorInW * tensorInH * tensorInCInit
+            acitivationsOut = tensorInW * tensorInH * tensorInCRest
 
-            totalWeightsSize32b_init = (
+            totalWeights_init = (
                 tensorInCInit * tensorInCInit
                 + 3 * 3 * tensorInCInit * tensorInCInit
                 + 2 * tensorInCInit * tensorInCRest
-            ) // 4
+            )
 
-            totalWeightsSize32b_rest = (
+            totalWeights_rest = (
                 tensorInCInit * tensorInCRest
                 + 3 * 3 * tensorInCInit * tensorInCInit
                 + tensorInCInit * tensorInCRest
-            ) // 4
-
-            totalWeightsSize32b_complete = (
-                totalWeightsSize32b_init + repeat * totalWeightsSize32b_rest
             )
 
-            activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty)
-            activationsOutL3_ty = MemRefType.get((acitivationsOutSize32b,), int32_ty)
-            weightsInL3_ty_init = MemRefType.get((totalWeightsSize32b_init,), int32_ty)
-            weightsInL3_ty_rest = MemRefType.get((totalWeightsSize32b_rest,), int32_ty)
+            totalWeights_complete = totalWeights_init + repeat * totalWeights_rest
 
-            weightsInL3_ty_complete = MemRefType.get(
-                (totalWeightsSize32b_complete,), int32_ty
-            )
+            activationsInL3_ty = MemRefType.get((activationsIn,), int8_ty)
+            activationsOutL3_ty = MemRefType.get((acitivationsOut,), int8_ty)
+            weightsInL3_ty_init = MemRefType.get((totalWeights_init,), int8_ty)
+            weightsInL3_ty_rest = MemRefType.get((totalWeights_rest,), int8_ty)
+
+            weightsInL3_ty_complete = MemRefType.get((totalWeights_complete,), int8_ty)
 
             @FuncOp.from_py_func(
                 activationsInL3_ty, weightsInL3_ty_complete, activationsOutL3_ty
@@ -950,27 +946,27 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     metadata="act1_00_02_01",
                     bd_id=0,
                     mem=inputFromL3,
-                    sizes=[1, 1, 1, activationsInSize32b],
+                    sizes=[1, 1, 1, activationsIn],
                 )
                 npu_dma_memcpy_nd(
                     metadata="outOFL2L3",
                     bd_id=2,
                     mem=outputToL3,
-                    sizes=[1, 1, 1, acitivationsOutSize32b],
+                    sizes=[1, 1, 1, acitivationsOut],
                 )
                 npu_dma_memcpy_nd(
                     metadata="wts_0_L3L2",
                     bd_id=1,
                     mem=weightsFromL3,
-                    sizes=[1, 1, 1, totalWeightsSize32b_init],
+                    sizes=[1, 1, 1, totalWeights_init],
                 )
 
                 npu_dma_memcpy_nd(
                     metadata="wts_1_L3L2",
                     bd_id=1,
                     mem=weightsFromL3,
-                    offsets=[0, 0, 0, totalWeightsSize32b_init],
-                    sizes=[1, 1, 1, totalWeightsSize32b_rest],
+                    offsets=[0, 0, 0, totalWeights_init],
+                    sizes=[1, 1, 1, totalWeights_rest],
                 )
 
                 npu_dma_memcpy_nd(
@@ -981,9 +977,9 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                         0,
                         0,
                         0,
-                        totalWeightsSize32b_init + totalWeightsSize32b_rest,
+                        totalWeights_init + totalWeights_rest,
                     ],
-                    sizes=[1, 1, 1, totalWeightsSize32b_rest],
+                    sizes=[1, 1, 1, totalWeights_rest],
                 )
 
                 npu_sync(column=1, row=0, direction=0, channel=0)
diff --git a/programming_examples/ml/softmax/aie2.py b/programming_examples/ml/softmax/aie2.py
index 47d60adf6a..812bd71781 100755
--- a/programming_examples/ml/softmax/aie2.py
+++ b/programming_examples/ml/softmax/aie2.py
@@ -22,9 +22,6 @@ def vector_softmax(trace_size):
     N = 262144  # *1024
     N_in_bytes = N * word_size_in
 
-    A_sz_in_i32s = N_in_bytes // 4
-    C_sz_in_i32s = N_in_bytes // 4
-
     # Tile sizes
     n = 1024
     N_div_n = N // n
@@ -108,7 +105,7 @@ def core_body():
                     yield_([])
 
         # To/from AIE-array data movement
-        tensor_ty = T.memref(N, T.i32())
+        tensor_ty = T.memref(N, T.bf16())
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty)
         def sequence(A, C):
@@ -122,12 +119,8 @@ def sequence(A, C):
                     offset=N_in_bytes,
                 )
 
-            npu_dma_memcpy_nd(
-                metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
-            )
-            npu_dma_memcpy_nd(
-                metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
-            )
+            npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_sync(column=0, row=0, direction=0, channel=0)
 
 
diff --git a/programming_examples/vision/color_detect/aie2_colorDetect.py b/programming_examples/vision/color_detect/aie2_colorDetect.py
index 19e4e04ca9..9a66785bbb 100644
--- a/programming_examples/vision/color_detect/aie2_colorDetect.py
+++ b/programming_examples/vision/color_detect/aie2_colorDetect.py
@@ -22,11 +22,9 @@
 
 lineWidth = width
 lineWidthInBytes = width * 4
-lineWidthInInt32s = lineWidthInBytes // 4
 
 enableTrace = False
-traceSizeInBytes = 8192
-traceSizeInInt32s = traceSizeInBytes // 4
+traceSize = 1024
 
 
 def color_detect():
@@ -242,8 +240,7 @@ def coreBody():
             # To/from AIE-array data movement
 
             tensorSize = width * height * 4  # 4 channels
-            tensorSizeInInt32s = tensorSize // 4
-            tensor_ty = MemRefType.get((tensorSizeInInt32s,), T.i32())
+            tensor_ty = MemRefType.get((tensorSize,), T.i8())
             memRef_16x16_ty = MemRefType.get(
                 (
                     16,
@@ -258,13 +255,13 @@ def sequence(I, B, O):
                     metadata="inOF_L3L2",
                     bd_id=1,
                     mem=I,
-                    sizes=[1, 1, 1, height * lineWidthInInt32s],
+                    sizes=[1, 1, 1, height * lineWidthInBytes],
                 )
                 npu_dma_memcpy_nd(
                     metadata="outOF_L2L3",
                     bd_id=0,
                     mem=O,
-                    sizes=[1, 1, 1, height * lineWidthInInt32s],
+                    sizes=[1, 1, 1, height * lineWidthInBytes],
                 )
                 npu_sync(column=0, row=0, direction=0, channel=0)
 
diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
index 1215a4ddd0..fa067226dc 100644
--- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py
+++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
@@ -247,12 +247,11 @@ def core_body():
             # To/from AIE-array data movement
 
             tensorSize = width * height
-            tensorSizeInInt32s = tensorSize // 4
 
             @FuncOp.from_py_func(
-                T.memref(tensorSizeInInt32s, T.i32()),
+                T.memref(tensorSize, T.i8()),
                 T.memref(32, T.i32()),  # not used
-                T.memref(tensorSizeInInt32s, T.i32()),
+                T.memref(tensorSize, T.i8()),
             )
             def sequence(inTensor, notUsed, outTensor):
                 # thresholdValue, maxValue, thresholdType
@@ -276,13 +275,13 @@ def sequence(inTensor, notUsed, outTensor):
                     metadata="inOOB_L3L2",
                     bd_id=1,
                     mem=inTensor,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_dma_memcpy_nd(
                     metadata="outOOB_L2L3",
                     bd_id=0,
                     mem=outTensor,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_sync(column=0, row=0, direction=0, channel=0)
 
diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
index 1af069d94e..3e095e356d 100644
--- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py
+++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
@@ -22,7 +22,6 @@
 heightMinus1 = height - 1
 lineWidth = width
 lineWidthInBytes = width * 4
-lineWidthInInt32s = lineWidthInBytes // 4
 
 enableTrace = False
 traceSizeInBytes = 8192
@@ -294,8 +293,7 @@ def core_body():
             # To/from AIE-array data movement
 
             tensorSize = width * height * 4  # 4 channels
-            tensorSizeInInt32s = tensorSize // 4
-            tensor_ty = T.memref(tensorSizeInInt32s, T.i32())
+            tensor_ty = T.memref(tensorSize, T.i8())
             memRef_16x16_ty = T.memref(16, 16, T.i32())
 
             @FuncOp.from_py_func(tensor_ty, memRef_16x16_ty, tensor_ty)
@@ -304,13 +302,13 @@ def sequence(I, B, O):
                     metadata="outOF_L2L3",
                     bd_id=0,
                     mem=O,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_dma_memcpy_nd(
                     metadata="inOF_L3L2",
                     bd_id=1,
                     mem=I,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_sync(column=0, row=0, direction=0, channel=0)
 
diff --git a/programming_examples/vision/vision_passthrough/aie2.py b/programming_examples/vision/vision_passthrough/aie2.py
index 8d568af388..35e1c5f515 100644
--- a/programming_examples/vision/vision_passthrough/aie2.py
+++ b/programming_examples/vision/vision_passthrough/aie2.py
@@ -19,7 +19,6 @@
     height = int(sys.argv[2])
 
 lineWidthInBytes = width
-lineWidthInInt32s = lineWidthInBytes // 4
 
 enableTrace = False
 traceSizeInBytes = 8192
@@ -68,8 +67,7 @@ def core_body():
             #    print(ctx.module.operation.verify())
 
             tensorSize = width * height
-            tensorSizeInInt32s = tensorSize // 4
-            tensor_ty = T.memref(tensorSizeInInt32s, T.i32())
+            tensor_ty = T.memref(tensorSize, T.i8())
 
             @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
             def sequence(inTensor, notUsed, outTensor):
@@ -157,13 +155,13 @@ def sequence(inTensor, notUsed, outTensor):
                     metadata="in",
                     bd_id=1,
                     mem=inTensor,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_dma_memcpy_nd(
                     metadata="out",
                     bd_id=0,
                     mem=outTensor,
-                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                    sizes=[1, 1, 1, tensorSize],
                 )
                 npu_sync(column=0, row=0, direction=0, channel=0)
 
diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir
new file mode 100644
index 0000000000..6f0ed03057
--- /dev/null
+++ b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir
@@ -0,0 +1,29 @@
+//===- bad_dma_to_npu.mlir --------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+// Date: July 3rd 2023
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s
+
+// CHECK:     error: 'aiex.npu.dma_memcpy_nd' op Minimum data transfer size required is 32bits.
+
+// Negative test: the memref below holds a single bf16 (16-bit) element; presumably that is what trips the 32-bit minimum-transfer diagnostic checked above -- TODO confirm against the op verifier.
+module @shimDmaMemcpy{
+  aie.device(xcve2302) {
+    memref.global "public" @toMem : memref<1xbf16>
+    func.func @sequence(%arg0: memref<1xbf16>, %arg1: memref<1xbf16>, %arg2: memref<1xbf16>) {
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256]) {id = 0 : i64, metadata = @toMem} : memref<1xbf16>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      return
+    }
+    aie.shim_dma_allocation @toMem (S2MM, 0, 0)
+  }
+}
+
diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir
new file mode 100644
index 0000000000..bb4af49938
--- /dev/null
+++ b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir
@@ -0,0 +1,29 @@
+//===- bad_dma_to_npu_datatype.mlir --------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+// Date: July 3rd 2023
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s
+
+// CHECK:    error: 'aiex.npu.dma_memcpy_nd' op Maximum element bit width allowed is 32bits.
+
+// Negative test: the memref element type below is i64 (64 bits wide), which exceeds the 32-bit maximum element width named in the diagnostic checked above.
+module @shimDmaMemcpy{
+  aie.device(xcve2302) {
+    memref.global "public" @toMem : memref<65536xi64>
+    func.func @sequence(%arg0: memref<65536xi64>, %arg1: memref<65536xi64>, %arg2: memref<65536xi64>) {
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256]) {id = 0 : i64, metadata = @toMem} : memref<65536xi64>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      return
+    }
+    aie.shim_dma_allocation @toMem (S2MM, 0, 0)
+  }
+}
+
diff --git a/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir
new file mode 100644
index 0000000000..af75cd8b33
--- /dev/null
+++ b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir
@@ -0,0 +1,40 @@
+//===- dma_to_npu_width_conversion.mlir --------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+// Date: July 3rd 2023
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s
+
+//CHECK-LABEL:  aie.device(xcve2302) {
+//CHECK:      memref.global "public" @toMem : memref<65536xbf16>
+//CHECK:      func.func @sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) {
+//CHECK:      aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 8192 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 32 : i32, d0_stride = 0 : i32, d1_size = 64 : i32, d1_stride = 127 : i32, d2_stride = 31 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+//CHECK:      aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32}
+//CHECK:      aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147680256 : ui32}
+//CHECK:      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+//CHECK:      return
+//CHECK:    }
+//CHECK:    aie.shim_dma_allocation @toMem(S2MM, 0, 0)
+//CHECK:  }
+
+
+// Input sizes/strides are written against a bf16 (16-bit) memref; the writebd checked above carries halved values (size 64 -> d0_size 32; strides 256 and 64 -> d1_stride 127 and d2_stride 31, i.e. half minus one; 4*64*64/2 = buffer_length 8192) -- presumably a rescale to 32-bit words, TODO confirm against the lowering.
+module @shimDmaMemcpy{
+  aie.device(xcve2302) {
+    memref.global "public" @toMem : memref<65536xbf16>
+    func.func @sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) {
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256]) {id = 0 : i64, metadata = @toMem} : memref<65536xbf16>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      return
+    }
+    aie.shim_dma_allocation @toMem (S2MM, 0, 0)
+  }
+}
+
diff --git a/test/dialect/AIEX/bad_npu_nd.mlir b/test/dialect/AIEX/bad_npu_nd.mlir
index 7881d80cfe..c89587b3f1 100644
--- a/test/dialect/AIEX/bad_npu_nd.mlir
+++ b/test/dialect/AIEX/bad_npu_nd.mlir
@@ -66,17 +66,3 @@ module {
 
 // -----
 
-module {
-  aie.device(npu1_4col) {
-    func.func @bad_npu_nd_type(%in : memref<1920x1080xi8>, %buf : memref<32xi32>, %out : memref<1920x1080xi8>) {
-      %c0 = arith.constant 0 : i64
-      %c1 = arith.constant 1 : i64
-      %c1920 = arith.constant 1920 : i64
-      %c1080 = arith.constant 1080 : i64
-      // expected-error@+1 {{must be used with memref type with element width 32.}}
-      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi8>
-      return
-    }
-    aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
-  }
-}