diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td
index 9f20d9222c..cb8fe6f5bb 100644
--- a/include/aie/Dialect/AIEX/IR/AIEX.td
+++ b/include/aie/Dialect/AIEX/IR/AIEX.td
@@ -509,6 +509,32 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
   let extraClassDeclaration = [{
     static unsigned getOffsetSizeAndStrideStartOperandIndex();
     static std::array<unsigned, 3> getArrayAttrMaxRanks();
+
+    /* Returns the provided multi-dimensional data transfer strides in units
+       of address granularity. In the IR, we express strides in units of the
+       element data type, but the hardware requires them in units of address
+       granularity. Address granularity is currently 4 bytes for all hardware.
+
+       The returned stride[0] is the second-lowest dimension stride, i.e.
+       stride 1. The lowest stride is currently implicitly one, but this is
+       not a hardware requirement and could be changed in the future. */
+    llvm::SmallVector<int64_t, 3> getStridesInAddressGranularity();
+
+    /* Returns the multi-dimensional data transfer sizes in units of address
+       granularity. These sizes are expressed in units of the element data
+       type in the IR, but the hardware requires them to be in units of
+       address granularity. Address granularity is currently 4 bytes for all
+       hardware.
+
+       The returned size[0] is the lowest dimension size. In the IR, the
+       sizes are given in reverse order; for example, specifying sizes as
+       [1, 2, 3, 4] in the IR results in this function returning
+       [4, 3, 2, 1]. */
+    llvm::SmallVector<int64_t, 4> getSizesInAddressGranularity();
+
+    /* Returns the data transfer offset in bytes, i.e. the first N bytes of
+       the target buffer will be skipped. In the IR, offsets are expressed in
+       units of the memref element data type size. */
+    int64_t getOffsetInBytes();
   }];

   let extraClassDefinition = [{
diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index e1102b4fe3..0f8ddc5f7c 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -64,10 +64,69 @@ LogicalResult AIEX::BroadcastPacketOp::verify() {
   return success();
 }
 
+llvm::SmallVector<int64_t, 3>
+AIEX::NpuDmaMemcpyNdOp::getStridesInAddressGranularity() {
+  const auto &targetModel = AIE::getTargetModel(*this);
+  MemRefType buffer = getMemref().getType();
+  auto elemWidth = buffer.getElementTypeBitWidth();
+  auto addressGranularity = targetModel.getAddressGenGranularity();
+  llvm::SmallVector<int64_t, 3> strides = llvm::map_to_vector(
+      llvm::reverse(getMixedStrides()),
+      [](OpFoldResult s) { return getConstantIntValue(s).value(); });
+  if (!strides.empty()) {
+    for (int i = 0; i < 3; i++) {
+      strides[i] = (strides[i] * elemWidth) / addressGranularity;
+    }
+  }
+  return strides;
+}
+
+llvm::SmallVector<int64_t, 4>
+AIEX::NpuDmaMemcpyNdOp::getSizesInAddressGranularity() {
+  const auto &targetModel = AIE::getTargetModel(*this);
+  MemRefType buffer = getMemref().getType();
+  auto elemWidth = buffer.getElementTypeBitWidth();
+  auto addressGranularity = targetModel.getAddressGenGranularity();
+  llvm::SmallVector<int64_t, 4> sizes = llvm::map_to_vector(
+      llvm::reverse(getMixedSizes()),
+      [](OpFoldResult s) { return getConstantIntValue(s).value(); });
+  if (!sizes.empty()) {
+    sizes[0] = (sizes[0] * elemWidth) / addressGranularity;
+  }
+  return sizes;
+}
+
+/* Returns the data transfer offset in bytes, i.e. the number of bytes
+   skipped at the start of the target buffer, linearized against the memref
+   shape. In the IR, offsets are expressed in units of the memref element
+   data type size. */
+int64_t AIEX::NpuDmaMemcpyNdOp::getOffsetInBytes() {
+  llvm::SmallVector<int64_t, 4> offsets = llvm::map_to_vector(
+      llvm::reverse(getMixedOffsets()),
+      [](OpFoldResult s) { return getConstantIntValue(s).value(); });
+  size_t stride = 1;
+  size_t offset = 0;
+  MemRefType my_memref = getMemref().getType();
+  auto shape = my_memref.getShape();
+  size_t R = shape.size();
+  size_t el_bit_width = my_memref.getElementTypeBitWidth();
+  assert(el_bit_width % 8 == 0 &&
+         "Expected Memref element bitwidth to be multiple of 8.");
+  size_t S = el_bit_width / 8;
+  for (size_t i = 0; i < R; i++) {
+    offset += offsets[i] * stride * S;
+    stride *= shape[R - i - 1];
+  }
+  return offset;
+}
+
 LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
   MemRefType buffer = getMemref().getType();
   const auto &targetModel = AIE::getTargetModel(*this);
   auto addressGranularity = targetModel.getAddressGenGranularity();
+  auto elemWidth = buffer.getElementTypeBitWidth();
+
   if (buffer.getElementTypeBitWidth() > addressGranularity) {
     return emitOpError("Maximum element bit width allowed is ")
            << addressGranularity << "bits. ";
@@ -79,25 +138,29 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
   if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) {
         return getConstantIntValue(s).has_value();
       }))
-    llvm::report_fatal_error("Only constant strides currently supported.");
+    return emitOpError("Only constant strides currently supported.");
   if (!llvm::all_of(getMixedSizes(), [](OpFoldResult s) {
         return getConstantIntValue(s).has_value();
       }))
-    llvm::report_fatal_error("Only constant sizes currently supported.");
+    return emitOpError("Only constant sizes currently supported.");
   if (!llvm::all_of(getMixedOffsets(), [](OpFoldResult s) {
         return getConstantIntValue(s).has_value();
       }))
-    llvm::report_fatal_error("Only constant offsets currently supported.");
+    return emitOpError("Only constant offsets currently supported.");
 
-  llvm::SmallVector<int64_t, 3> strides =
+  llvm::SmallVector<int64_t, 3> raw_strides =
       llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
         return getConstantIntValue(s).value();
       });
-  llvm::SmallVector<int64_t, 4> sizes =
+  llvm::SmallVector<int64_t, 4> raw_sizes =
       llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
         return getConstantIntValue(s).value();
       });
+  llvm::SmallVector<int64_t, 3> strides = getStridesInAddressGranularity();
+  llvm::SmallVector<int64_t, 4> sizes = getSizesInAddressGranularity();
+  int64_t offset = getOffsetInBytes();
+
   if (sizes[3] > 64)
     return emitOpError("Size 3 exceeds the [1:64] range.");
   if (strides[1] && sizes[1] > 0x3FF)
@@ -110,6 +173,36 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
     return emitOpError("Stride 2 exceeds the [1:1M] range.");
   if (strides[0] > 0x100000)
     return emitOpError("Stride 1 exceeds the [1:1M] range.");
+
+  if (offset % 4 != 0) {
+    return emitOpError("Offset must be 4-byte-aligned.");
+  }
+
+  bool error = false;
+  std::stringstream msg;
+  for (int i = 0; i < 3; i++) {
+    if (raw_strides[i] * elemWidth % addressGranularity != 0) {
+      error = true;
+      msg << "Stride " << i << " is " << raw_strides[i] << " elements * "
+          << (elemWidth / 8) << " bytes = " << (raw_strides[i] * elemWidth / 8)
+          << " bytes, which is not divisible by " << (addressGranularity / 8)
+          << ". ";
+    }
+  }
+  if (error) {
+    return emitOpError(msg.str());
+  }
+
+  if (raw_sizes[0] * elemWidth % addressGranularity != 0) {
+    std::stringstream msg;
+    msg << "Transfer sizes must be multiples of " << (addressGranularity / 8)
+        << " bytes. " << raw_sizes[0] << " elements at " << (elemWidth / 8)
+        << " bytes each equal " << (raw_sizes[0] * elemWidth / 8)
+        << " bytes, which is not divisible by " << (addressGranularity / 8)
+        << ". ";
+    return emitOpError(msg.str());
+  }
+
   return success();
 }
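The three helpers above all reduce to the same unit conversions. As a cross-check, here is an illustrative Python mirror of that arithmetic (hypothetical helper name; the authoritative logic is the C++ above, assuming the 4-byte address granularity that the AIEX.td comments describe for current hardware):

    # Hypothetical mirror of the unit conversions; lowest dimension first.
    def to_hw_units(strides_el, sizes_el, offsets_el, shape, elem_bytes, gran=4):
        # Strides: scaled from elements to address-granularity words.
        strides = [(s * elem_bytes) // gran for s in strides_el]
        # Sizes: only the lowest (contiguous) dimension is rescaled; the
        # higher dimensions count iterations, not elements.
        sizes = list(sizes_el)
        sizes[0] = (sizes[0] * elem_bytes) // gran
        # Offset: linearized against the memref shape, in bytes.
        offset, stride = 0, 1
        for i, off in enumerate(offsets_el):
            offset += off * stride * elem_bytes
            stride *= shape[len(shape) - i - 1]
        return strides, sizes, offset

    # An i16 memref: a stride of 2 elements is 4 bytes, i.e. 1 word.
    assert to_hw_units([2, 4, 0], [2, 3, 2, 1], [0], [24], elem_bytes=2) == \
        ([1, 2, 0], [1, 3, 2, 1], 0)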
"; + return emitOpError(msg.str()); + } + return success(); } diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 94d514c23a..58eb893af7 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -219,31 +219,9 @@ struct DmaToNpuPattern : OpConversionPattern { auto issue_token = BoolAttr::get(ctx, false); auto repeat_count = zero; - llvm::SmallVector strides = llvm::map_to_vector( - llvm::reverse(op.getMixedStrides()), - [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - llvm::SmallVector sizes = llvm::map_to_vector( - llvm::reverse(op.getMixedSizes()), - [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - llvm::SmallVector offsets = llvm::map_to_vector( - llvm::reverse(op.getMixedOffsets()), - [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - - MemRefType buffer = op.getMemref().getType(); - const auto &targetModel = AIE::getTargetModel(op); - auto elemWidth = buffer.getElementTypeBitWidth(); - auto addressGranularity = targetModel.getAddressGenGranularity(); - if (elemWidth < addressGranularity) { - if (!strides.empty()) { - for (int i = 0; i < 3; i++) { - strides[i] = (strides[i] * elemWidth) / addressGranularity; - } - } - if (!sizes.empty()) - sizes[0] = (sizes[0] * elemWidth) / addressGranularity; - if (!offsets.empty()) - offsets[0] = (offsets[0] * elemWidth) / addressGranularity; - } + llvm::SmallVector strides = op.getStridesInAddressGranularity(); + llvm::SmallVector sizes = op.getSizesInAddressGranularity(); + int64_t offset = op.getOffsetInBytes(); // column column = IntegerAttr::get(i32ty, col); @@ -271,19 +249,6 @@ struct DmaToNpuPattern : OpConversionPattern { buffer_length = IntegerAttr::get(i32ty, repeat_length); // buffer_offset - size_t stride = 1; - size_t offset = 0; - MemRefType my_memref = op.getMemref().getType(); - auto shape = my_memref.getShape(); - size_t R = shape.size(); - size_t el_bit_width = my_memref.getElementTypeBitWidth(); - assert(el_bit_width % 8 == 0 && - "Expected Memref element bitwidth to be multiple of 8."); - size_t S = el_bit_width / 8; - for (size_t i = 0; i < R; i++) { - offset += offsets[i] * stride * S; - stride *= shape[R - i - 1]; - } buffer_offset = IntegerAttr::get(i32ty, offset); // enable_packet diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 6b6a48e400..cad1d9f52d 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -231,7 +231,7 @@ def sequence(A, B, C): for tile_row in range(num_tile_rows): A_row_offset = ( ((tile_row_block * rows_per_block) + tile_row) * m * K - ) * 2 + ) npu_dma_memcpy_nd( metadata="inA", bd_id=2 * tile_row + 1, diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 4adb1cd7e2..2ecf80cbcb 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -317,7 +317,7 @@ def sequence(A, B, C): C_row_offset = tile_row_block * rows_per_block * m * n_rows * N for i in range(n_cols): C_col_offset = i * n - C_offset = (C_col_offset + C_row_offset) * 2 + C_offset = C_col_offset + C_row_offset npu_dma_memcpy_nd( metadata=outC_fifo_names[i], bd_id=0, @@ -334,8 
diff --git a/test/dialect/AIEX/bad_npu_nd.mlir b/test/dialect/AIEX/bad_npu_nd.mlir
index c89587b3f1..e49530e325 100644
--- a/test/dialect/AIEX/bad_npu_nd.mlir
+++ b/test/dialect/AIEX/bad_npu_nd.mlir
@@ -66,3 +66,103 @@ module {
 }
 
 // -----
+
+// Offsets need to be 4-byte aligned.
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd_offset(%a : memref<8xi8>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64
+      %c8 = arith.constant 8 : i64
+      // expected-error@+1 {{Offset must be 4-byte-aligned}}
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c1][%c1,%c1,%c1,%c8][%c0,%c0,%c1]) { metadata = @fifo, id = 0 : i64 } : memref<8xi8>
+      return
+    }
+    aie.shim_dma_allocation @fifo (MM2S, 0, 0)
+  }
+}
+
+// -----
+
+// Strides and sizes for element types narrower than i32 must not trip the
+// hardware limit checks once converted to 4-byte granularity. The following
+// tests check this.
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd(%a : memref<8xi8>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64
+      %c4 = arith.constant 4 : i64
+      %c8 = arith.constant 8 : i64
+      %c2048 = arith.constant 2048 : i64
+      // Although 2048 exceeds the [0:1023] limit for size 0, the elements are
+      // i8s, so this is a size of 512 at address granularity (4 bytes) and
+      // hence passes verification.
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      return
+    }
+    aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
+  }
+}
+
+// -----
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd(%a : memref<8xi16>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64
+      %c4 = arith.constant 4 : i64
+      %c8 = arith.constant 8 : i64
+      %c2048 = arith.constant 2048 : i64
+      // expected-error@+1 {{Size 0 exceeds the [0:1023] range}}
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
+      return
+    }
+    aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
+  }
+}
+
+// -----
+
+// Strides and sizes are expressed at 4-byte granularity in hardware, but we
+// express them at memref element type granularity. The following tests make
+// sure the proper errors are generated when the two cannot be reconciled.
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd(%a : memref<8xi8>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64 // Stride of 2 i8s = 2 bytes < 4-byte granularity; should not be possible
+      %c8 = arith.constant 8 : i64
+      %c1920 = arith.constant 1920 : i64
+      %c1080 = arith.constant 1080 : i64
+      // expected-error@+1 {{Stride 0 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}}
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c2]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      return
+    }
+    aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
+  }
+}
+
+// -----
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd(%a : memref<8xi8>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64
+      %c4 = arith.constant 4 : i64
+      %c8 = arith.constant 8 : i64
+      %c1920 = arith.constant 1920 : i64
+      %c1080 = arith.constant 1080 : i64
+      // expected-error@+1 {{2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4}}
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      return
+    }
+    aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
+  }
+}
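The expected-error strings in these cases can be replayed against a small model of the new checks. A hypothetical Python mirror, checking offset alignment first, then stride divisibility, then size divisibility, in the same order as the verifier above (4-byte granularity assumed):

    # Hypothetical mirror of the verifier's ordering of the new checks.
    def first_error(raw_strides, raw_sizes, offset_bytes, elem_bytes, gran=4):
        if offset_bytes % gran != 0:
            return "Offset must be 4-byte-aligned."
        for i, s in enumerate(raw_strides):
            if (s * elem_bytes) % gran != 0:
                return f"Stride {i} is {s * elem_bytes} bytes, not divisible by {gran}."
        if (raw_sizes[0] * elem_bytes) % gran != 0:
            return f"Size 0 is {raw_sizes[0] * elem_bytes} bytes, not divisible by {gran}."
        return None

    print(first_error([1, 0, 0], [8, 1, 1, 1], 1, elem_bytes=1))  # offset case
    print(first_error([2, 0, 0], [8, 1, 1, 1], 0, elem_bytes=1))  # stride case
    print(first_error([4, 0, 0], [2, 1, 1, 1], 0, elem_bytes=1))  # size case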
diff --git a/test/npu-xrt/nd_memcpy_transforms/aie2.py b/test/npu-xrt/nd_memcpy_transforms/aie2.py
new file mode 100644
index 0000000000..3df95e9493
--- /dev/null
+++ b/test/npu-xrt/nd_memcpy_transforms/aie2.py
@@ -0,0 +1,118 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+# REQUIRES: ryzen_ai
+#
+# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
+# RUN: %python %S/aie2.py > %S/aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt %S/aie2.mlir
+# RUN: clang %S/test.cpp -o test -std=c++11 -Wall %xrt_flags -lrt -lstdc++
+# RUN: %run_on_npu ./test | FileCheck %s
+# CHECK: PASS!
+
+from aie.extras.context import mlir_mod_ctx
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+
+dtype = T.i16
+a_len = 8
+b_len = 12
+c_offset = 2
+c_len = a_len + b_len
+
+
+def memref_sz(m: MemRefType):
+    sz = 1
+    for s in m.shape:
+        sz *= s
+    return sz
+
+
+def design():
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.npu1_4col)
+        def device_body():
+            memref_a = T.memref(a_len, dtype())
+            memref_b = T.memref(b_len, dtype())
+            memref_c = T.memref(c_len, dtype())
+
+            concat_func = external_func(
+                "concat",
+                inputs=[memref_a, memref_b, memref_c, T.i32(), T.i32(), T.i32()],
+            )
+
+            # Tile declarations as tile[row][col]
+            tiles = [[tile(col, row) for col in range(0, 4)] for row in range(0, 6)]
+            # Shim tiles: tiles[0][0..3]
+            # Mem tiles: tiles[1][0..3]
+            # Cores: tiles[2..5][0..3]
+
+            fifo_a = object_fifo("fifo_a", tiles[0][0], tiles[2][0], 2, memref_a)
+            fifo_b = object_fifo("fifo_b", tiles[0][0], tiles[2][0], 2, memref_b)
+            fifo_c = object_fifo("fifo_c", tiles[2][0], tiles[0][0], 2, memref_c)
+
+            # Core
+            @core(tiles[2][0], "kernel.o")
+            def core_body():
+                for _ in for_(0, 0xFFFFFFFF):
+                    elem_c = fifo_c.acquire(ObjectFifoPort.Produce, 1)
+                    elem_a = fifo_a.acquire(ObjectFifoPort.Consume, 1)
+                    elem_b = fifo_b.acquire(ObjectFifoPort.Consume, 1)
+                    call(
+                        concat_func,
+                        [
+                            elem_a,
+                            elem_b,
+                            elem_c,
+                            memref_sz(memref_a),
+                            memref_sz(memref_b),
+                            memref_sz(memref_c),
+                        ],
+                    )
+                    fifo_a.release(ObjectFifoPort.Consume, 1)
+                    fifo_b.release(ObjectFifoPort.Consume, 1)
+                    fifo_c.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+
+            # To/from AIE-array data movement
+            @FuncOp.from_py_func(memref_a, memref_b, memref_c)
+            def sequence(A, B, C):
+                npu_dma_memcpy_nd(
+                    metadata=fifo_a.sym_name.value,
+                    bd_id=1,
+                    mem=A,
+                    offsets=[0, 0, 0, 0],
+                    sizes=[1, a_len // 4, 2, 2],
+                    strides=[0, 2, a_len // 2],
+                )
+                npu_dma_memcpy_nd(
+                    metadata=fifo_b.sym_name.value,
+                    bd_id=1,
+                    mem=B,
+                    offsets=[0, 0, 0, 0],
+                    sizes=[1, 2, b_len // 4, 2],
+                    strides=[0, 2, 4],
+                )
+                npu_dma_memcpy_nd(
+                    metadata=fifo_c.sym_name.value,
+                    bd_id=0,
+                    mem=C,
+                    offsets=[0, 0, 0, c_offset],
+                    sizes=[1, 1, 1, c_len],
+                    strides=[0, 0, 0],
+                )
+                npu_sync(column=0, row=0, direction=0, channel=0)
+
+        print(ctx.module)
+
+
+design()
diff --git a/test/npu-xrt/nd_memcpy_transforms/kernel.cc b/test/npu-xrt/nd_memcpy_transforms/kernel.cc
new file mode 100644
index 0000000000..ef3ac9691a
--- /dev/null
+++ b/test/npu-xrt/nd_memcpy_transforms/kernel.cc
@@ -0,0 +1,20 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <stdint.h>
+
+typedef int16_t my_t;
+
+extern "C" {
+void concat(my_t *a, my_t *b, my_t *c, int a_sz, int b_sz, int c_sz) {
+  // Concatenates a and b and writes the result to c.
+  int i = 0;
+  for (; i < c_sz && i < a_sz; i++) {
+    c[i] = a[i];
+  }
+  for (; i < c_sz && i - a_sz < b_sz; i++) {
+    c[i] = b[i - a_sz];
+  }
+}
+}
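Before reading test.cpp below, it helps to reconstruct what the host should observe: the sizes and strides in aie2.py interleave halves of A and B on the way into the array, the kernel concatenates the two incoming streams, and the C transfer writes the result starting two elements into the output buffer. A small Python model of the 4-D address generation (illustrative only; it assumes the lowest dimension has an implicit stride of 1, as the AIEX.td comment above notes):

    # Illustrative model: sizes/strides ordered highest..lowest dimension,
    # with the lowest dimension using an implicit stride of 1.
    def stream(buf, sizes, strides):
        s3, s2, s1, s0 = sizes
        t3, t2, t1 = strides
        return [buf[i3 * t3 + i2 * t2 + i1 * t1 + i0]
                for i3 in range(s3) for i2 in range(s2)
                for i1 in range(s1) for i0 in range(s0)]

    a = [2 * i for i in range(8)]       # even values, as in test.cpp
    b = [2 * i + 1 for i in range(12)]  # odd values
    a_in = stream(a, [1, 2, 2, 2], [0, 2, 4])  # [0, 2, 8, 10, 4, 6, 12, 14]
    b_in = stream(b, [1, 2, 3, 2], [0, 2, 4])  # [1, 3, 9, 11, 17, 19, 5, ...]
    expected = [0, 0] + a_in + b_in  # matches ref[] in test.cpp below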
diff --git a/test/npu-xrt/nd_memcpy_transforms/test.cpp b/test/npu-xrt/nd_memcpy_transforms/test.cpp
new file mode 100644
index 0000000000..646496e08a
--- /dev/null
+++ b/test/npu-xrt/nd_memcpy_transforms/test.cpp
@@ -0,0 +1,143 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+// --------------------------------------------------------------------------
+// AIE Specifics
+// --------------------------------------------------------------------------
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+#ifndef XCLBIN
+#define XCLBIN "final.xclbin"
+#endif
+
+#ifndef INSTS_TXT
+#define INSTS_TXT "insts.txt"
+#endif
+
+#ifndef KERNEL_NAME
+#define KERNEL_NAME "MLIR_AIE"
+#endif
+
+#define DTYPE int16_t
+#define A_DATATYPE DTYPE
+#define B_DATATYPE DTYPE
+#define C_DATATYPE DTYPE
+
+#define A_LEN 8
+#define B_LEN 12
+#define C_OFFSET 2
+#define C_LEN (A_LEN + B_LEN + C_OFFSET)
+
+#define A_SIZE (A_LEN * sizeof(A_DATATYPE)) // in bytes
+#define B_SIZE (B_LEN * sizeof(B_DATATYPE)) // in bytes
+#define C_SIZE (C_LEN * sizeof(C_DATATYPE)) // in bytes
+
+int main(int argc, const char *argv[]) {
+
+  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
+  assert(instr_v.size() > 0);
+
+  // Get a device handle
+  unsigned int device_index = 0;
+  xrt::device device = xrt::device(device_index);
+
+  // Load the xclbin
+  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
+
+  // Get the kernel from the xclbin
+  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
+  xrt::xclbin::kernel xkernel = *std::find_if(
+      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
+        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
+      });
+  std::string kernel_name = xkernel.get_name();
+  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernel_name);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_a =
+      xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_b =
+      xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_c =
+      xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  A_DATATYPE *buf_a = bo_a.map<A_DATATYPE *>();
+  for (int i = 0; i < A_SIZE / sizeof(buf_a[0]); i++) {
+    buf_a[i] = 2 * i; // even
+  }
+  B_DATATYPE *buf_b = bo_b.map<B_DATATYPE *>();
+  for (int i = 0; i < B_SIZE / sizeof(buf_b[0]); i++) {
+    buf_b[i] = 2 * i + 1; // odd
+  }
+  C_DATATYPE *buf_c = bo_c.map<C_DATATYPE *>();
+  memset(buf_c, 0, C_SIZE);
+
+  // Instruction buffer for DMA configuration
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  for (int i = 0; i < C_SIZE / sizeof(buf_c[0]); i++) {
+    std::cout << std::setw(4) << (long)buf_c[i] << " ";
+  }
+  std::cout << std::endl;
+
+  C_DATATYPE ref[] = {0, 0, 0, 2, 8, 10, 4, 6, 12, 14, 1,
+                      3, 9, 11, 17, 19, 5, 7, 13, 15, 21, 23};
+  if (memcmp(ref, buf_c, sizeof(ref)) == 0) {
+    std::cout << "PASS!" << std::endl;
+  } else {
+    std::cout << "FAIL." << std::endl;
+  }
+
+  return 0;
+}