From 312a989eebf115e3eb1d57bb88bac33f265e9897 Mon Sep 17 00:00:00 2001
From: Philip James-Roxby <phil.jamesroxby@gmail.com>
Date: Thu, 25 Apr 2024 14:08:24 -0600
Subject: [PATCH] Pjr docs (#1415)

Co-authored-by: pjr <pjr@xilinx.com>
Co-authored-by: Kristof Denolf <kristof.denolf@amd.com>
Co-authored-by: Joseph Melber <jgmelber@gmail.com>
---
 .../basic/matrix_scalar_add/README.md         |  40 ++++-
 .../basic/matrix_scalar_add/aie2.py           | 142 +++++++++---------
 .../basic/vector_scalar_add/README.md         |  17 ++-
 .../basic/vector_scalar_add/aie2.py           |  34 +++--
 .../basic/vector_scalar_add/test.cpp          | 102 ++++---------
 .../basic/vector_vector_add/README.md         |  38 ++++-
 .../basic/vector_vector_add/aie2.py           | 130 ++++++++--------
 .../basic/vector_vector_add/test.cpp          |  96 ++++--------
 .../basic/vector_vector_mul/README.md         |  40 ++++-
 .../basic/vector_vector_mul/aie2.py           | 132 ++++++++--------
 .../basic/vector_vector_mul/test.cpp          | 101 ++++---------
 11 files changed, 438 insertions(+), 434 deletions(-)
diff --git a/programming_examples/basic/matrix_scalar_add/README.md b/programming_examples/basic/matrix_scalar_add/README.md
index c29df4bfaf..304c7c3a43 100644
--- a/programming_examples/basic/matrix_scalar_add/README.md
+++ b/programming_examples/basic/matrix_scalar_add/README.md
@@ -10,18 +10,50 @@
 
 # <ins>Matrix Scalar Addition</ins>
 
-A single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1` and stores it back. The DMA in the Shim tile is programmed to bring the bottom left `8x16` portion of a larger `16x128` matrix into the tile to perform the operation. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.
+This design shows an extremely simple single AIE design, which is incrementing every value in an input matrix.
+
+It shows a number of features which can then be expanded to more realistic designs.  
+
+Firstly, a 2D DMA pattern is set up to access data from the input and output memories. Small `8x16` subtiles are accessed from the larger `16x128` input and output matrices.  Thinking about input and output spaces as large grids, with smaller grids of work being dispatched to individual AIE cores, is a fundamental, reusable concept.
+
+Secondly, the design shows how the body of work done by each AIE core is a combination of data movement (the object FIFO acquires and releases) together with compute, which in this case is expressed using a number of different MLIR dialects, like `arith`, `memref`, etc. next to `mlir-aie`.
+
+Finally, the overall structural design shows how complete designs are a combination of a static design, consisting of cores, connections and some part of the data movement, together with a run time sequence for controlling the design.
+
+## Functionality
+
+A single AIE core performs a very simple `+` operation where the kernel loads data from its local memory, increments the value by `1` and stores it back to the local memory. The DMA in the Shim tile is programmed to bring the bottom left `8x16` portion of a larger `16x128` matrix into the tile to perform the operation. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.
+
+The kernel executes on AIE tile (`col`, 2) - this is actually the first core in a column, as the shim tile is on row 0, and the mem tile is on row 1. Input data is brought to the local memory of the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. 
+
+
+## Usage
+
+### NPU
+
+To compile the design and C++ testbench:
 
-The kernel executes on AIE tile (`col`, 2). Input data is brought to the local memory of the tile from Shim tile (`col`, 0). The value of `col` depends on whether the application is targeting NPU or VCK5000. The Shim tile is programmed with a 2D DMA to bring only a 2D submatrix into the AIE tile for processing. 
 
-To compile and run the design for NPU:
 ```
 make
+make matrixAddOne
+```
+
+To run the design:
+
+```
 make run
 ```
 
-To compile and run the design for VCK5000:
+### VCK5000
+
+To compile the design and C++ testbench:
 ```
 make vck5000
+```
+
+To run the design:
+
+```
 ./test.elf
 ```
diff --git a/programming_examples/basic/matrix_scalar_add/aie2.py b/programming_examples/basic/matrix_scalar_add/aie2.py
index a80ba794e6..eeace84b6a 100644
--- a/programming_examples/basic/matrix_scalar_add/aie2.py
+++ b/programming_examples/basic/matrix_scalar_add/aie2.py
@@ -19,8 +19,8 @@
 IMAGE_SIZE = IMAGE_WIDTH * IMAGE_HEIGHT
 
 # Size of the tile we are processing
-TILE_WIDTH = 16
 TILE_HEIGHT = 8
+TILE_WIDTH = 16
 TILE_SIZE = TILE_WIDTH * TILE_HEIGHT
 
 NUM_3D = IMAGE_WIDTH / TILE_WIDTH
@@ -30,78 +30,76 @@
 
 
 def my_matrix_add_one():
-    with mlir_mod_ctx() as ctx:
-
-        if len(sys.argv) != 3:
-            raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
-
-        if sys.argv[1] == "npu":
-            dev = AIEDevice.npu
-        elif sys.argv[1] == "xcvc1902":
-            dev = AIEDevice.xcvc1902
-        else:
-            raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
-
-        @device(dev)
-        def device_body():
-            memRef_ty = T.memref(TILE_SIZE, T.i32())
-
-            # Tile declarations
-            ShimTile = tile(int(sys.argv[2]), 0)
-            ComputeTile2 = tile(int(sys.argv[2]), 2)
-
-            # AIE-array data movement with object fifos
-            # Input
-            of_in1 = object_fifo(
-                "in0", ShimTile, ComputeTile2, objfifo_capacity, memRef_ty
-            )
 
-            # Output
-            of_out1 = object_fifo(
-                "out0", ComputeTile2, ShimTile, objfifo_capacity, memRef_ty
-            )
-
-            # Set up compute tiles
-
-            # Compute tile 2
-            @core(ComputeTile2)
-            def core_body():
-                # Effective while(1)
-                for _ in for_(8):
-                    elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
-                    elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
-                    for i in for_(TILE_SIZE):
-                        v0 = memref.load(elem_in, [i])
-                        v1 = arith.addi(v0, arith.constant(1, T.i32()))
-                        memref.store(v1, elem_out, [i])
-                        yield_([])
-                    of_in1.release(ObjectFifoPort.Consume, 1)
-                    of_out1.release(ObjectFifoPort.Produce, 1)
+    if len(sys.argv) != 3:
+        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu
+    elif sys.argv[1] == "xcvc1902":
+        dev = AIEDevice.xcvc1902
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+    @device(dev)
+    def device_body():
+        memRef_ty = T.memref(TILE_SIZE, T.i32())
+
+        # Tile declarations
+        ShimTile = tile(int(sys.argv[2]), 0)
+        ComputeTile2 = tile(int(sys.argv[2]), 2)
+
+        # AIE-array data movement with object fifos
+        # Input
+        of_in1 = object_fifo("in0", ShimTile, ComputeTile2, objfifo_capacity, memRef_ty)
+
+        # Output
+        of_out1 = object_fifo(
+            "out0", ComputeTile2, ShimTile, objfifo_capacity, memRef_ty
+        )
+
+        # Set up compute tile 2
+        @core(ComputeTile2)
+        def core_body():
+            # Effective while(1)
+            for _ in for_(sys.maxsize):
+                elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
+                elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
+                for i in for_(TILE_SIZE):
+                    v0 = memref.load(elem_in, [i])
+                    v1 = arith.addi(v0, arith.constant(1, T.i32()))
+                    memref.store(v1, elem_out, [i])
                     yield_([])
-
-            # To/from AIE-array data movement
-
-            tensor_ty = T.memref(TILE_SIZE, T.i32())
-
-            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
-            def sequence(inTensor, notUsed, outTensor):
-                npu_dma_memcpy_nd(
-                    metadata="out0",
-                    bd_id=0,
-                    mem=outTensor,
-                    sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-                    strides=[1, 1, IMAGE_WIDTH],
-                )
-                npu_dma_memcpy_nd(
-                    metadata="in0",
-                    bd_id=1,
-                    mem=inTensor,
-                    sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-                    strides=[1, 1, IMAGE_WIDTH],
-                )
-                npu_sync(column=0, row=0, direction=0, channel=0)
-
-    print(ctx.module)
+                of_in1.release(ObjectFifoPort.Consume, 1)
+                of_out1.release(ObjectFifoPort.Produce, 1)
+                yield_([])
+
+        # To/from AIE-array data movement
+
+        tensor_ty = T.memref(TILE_SIZE, T.i32())
+
+        @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
+        def sequence(inTensor, notUsed, outTensor):
+            npu_dma_memcpy_nd(
+                metadata="out0",
+                bd_id=0,
+                mem=outTensor,
+                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
+                strides=[1, 1, IMAGE_WIDTH],
+            )
+            npu_dma_memcpy_nd(
+                metadata="in0",
+                bd_id=1,
+                mem=inTensor,
+                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
+                strides=[1, 1, IMAGE_WIDTH],
+            )
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
-my_matrix_add_one()
+with mlir_mod_ctx() as ctx:
+    my_matrix_add_one()
+    res = ctx.module.operation.verify()
+    if res == True:
+        print(ctx.module)
+    else:
+        print(res)
diff --git a/programming_examples/basic/vector_scalar_add/README.md b/programming_examples/basic/vector_scalar_add/README.md
index 5223393ffe..1715d66305 100644
--- a/programming_examples/basic/vector_scalar_add/README.md
+++ b/programming_examples/basic/vector_scalar_add/README.md
@@ -10,15 +10,26 @@
 
 # Vector Scalar Addition:
 
-Single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1` and stores it back.
+This design shows an extremely simple single AIE design, which is incrementing every value in an input vector.
 
-The kernel executes on AIE tile (0, 2). Input data is brought to the local memory of the tile from the Shim tile (0, 0) through the Mem tile (0, 1). The size of the input data from the Shim tile is `16xi32`. The data is stored in the Mem tile and sent to the AIE tile in smaller pieces of size `8xi32`. Output data from the AIE tile to the Shim tile follows the same process, in reverse.
+It shows a number of features which can then be expanded to more realistic designs.  
+
+Firstly, a simple 1D DMA pattern is set up to access data from the input and output memories. Small `64` element subtiles are accessed from the larger `1024` element input and output vectors.  Thinking about input and output spaces as large grids, with smaller grids of work being dispatched to individual AIE cores, is a fundamental, reusable concept.
+
+Secondly, these `64` element subtiles, which are now in the mem tile, are split into two smaller `32` element subtiles and sent to the AIE core to be processed.  This shows how the multi-level memory hierarchy of the NPU can be used.
+
+Thirdly, the design shows how the body of work done by each AIE core is a combination of data movement (the object FIFO acquires and releases) together with compute, which in this case is expressed using a number of different MLIR dialects, like `arith`, `memref`, etc. next to `mlir-aie`.
+
+Finally, the overall structural design shows how complete designs are a combination of a static design, consisting of cores, connections and some part of the data movement, together with a run time sequence for controlling the design.
+A single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1` and stores it back.
+
+The kernel executes on AIE tile (0, 2). Input data is first brought to the Mem tile (0, 1) using the Shim tile (0, 0). The size of the input data from the Shim tile is `64xi32`. The data is stored in the Mem tile and sent to the AIE tile in smaller pieces of size `32xi32`. Output data from the AIE tile to the Shim tile follows the same process, in reverse.
 
 This example does not contain a C++ kernel file. The kernel is expressed in Python bindings for the `memref` and `arith` dialects that is then compiled with the AIE compiler to generate the AIE core binary.
 
 ## Source Files Overview
 
-1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). 
+1. `aie2.py`: defines the AIE array structural design using IRON AIE language bindings. This generates mlir-aie that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and inst.txt for the NPU in Ryzen™ AI). 
 
 1. `test.cpp`: This C++ code is a testbench for the design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results.
 
diff --git a/programming_examples/basic/vector_scalar_add/aie2.py b/programming_examples/basic/vector_scalar_add/aie2.py
index b4e2a84b97..83b4bd8a8c 100644
--- a/programming_examples/basic/vector_scalar_add/aie2.py
+++ b/programming_examples/basic/vector_scalar_add/aie2.py
@@ -11,13 +11,19 @@
 from aie.extras.dialects.ext import memref, arith
 from aie.extras.context import mlir_mod_ctx
 
+import sys
+
+PROBLEM_SIZE = 1024
+MEM_TILE_WIDTH = 64
+AIE_TILE_WIDTH = 32
+
 
 def my_vector_bias_add():
 
     @device(AIEDevice.npu)
     def device_body():
-        memRef_16_ty = T.memref(16, T.i32())
-        memRef_8_ty = T.memref(8, T.i32())
+        memRef_mem_tile_ty = T.memref(MEM_TILE_WIDTH, T.i32())
+        memRef_aie_tile_ty = T.memref(AIE_TILE_WIDTH, T.i32())
 
         # Tile declarations
         ShimTile = tile(0, 0)
@@ -26,13 +32,13 @@ def device_body():
 
         # AIE-array data movement with object fifos
         # Input
-        of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty)
-        of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
+        of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_mem_tile_ty)
+        of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_aie_tile_ty)
         object_fifo_link(of_in0, of_in1)
 
         # Output
-        of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty)
-        of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
+        of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_mem_tile_ty)
+        of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_aie_tile_ty)
         object_fifo_link(of_out1, of_out0)
 
         # Set up compute tiles
@@ -41,10 +47,10 @@ def device_body():
         @core(ComputeTile2)
         def core_body():
             # Effective while(1)
-            for _ in for_(8):
+            for _ in for_(sys.maxsize):
                 elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
                 elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
-                for i in for_(8):
+                for i in for_(AIE_TILE_WIDTH):
                     v0 = memref.load(elem_in, [i])
                     v1 = arith.addi(v0, arith.constant(1, T.i32()))
                     memref.store(v1, elem_out, [i])
@@ -54,17 +60,15 @@ def core_body():
                 yield_([])
 
         # To/from AIE-array data movement
+        tensor_ty = T.memref(PROBLEM_SIZE, T.i32())
 
-        memRef_64_ty = T.memref(64, T.i32())
-        memRef_32_ty = T.memref(32, T.i32())
-
-        @FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty)
-        def sequence(inTensor, notUsed, outTensor):
+        @FuncOp.from_py_func(tensor_ty, tensor_ty)
+        def sequence(inTensor, outTensor):
             npu_dma_memcpy_nd(
-                metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
+                metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, PROBLEM_SIZE]
             )
             npu_dma_memcpy_nd(
-                metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
+                metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, PROBLEM_SIZE]
             )
             npu_sync(column=0, row=0, direction=0, channel=0)
 
diff --git a/programming_examples/basic/vector_scalar_add/test.cpp b/programming_examples/basic/vector_scalar_add/test.cpp
index a48ce210ed..f92f856b37 100644
--- a/programming_examples/basic/vector_scalar_add/test.cpp
+++ b/programming_examples/basic/vector_scalar_add/test.cpp
@@ -20,79 +20,38 @@
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-constexpr int IN_SIZE = 64;
-constexpr int OUT_SIZE = 64;
+#include "test_utils.h"
 
 namespace po = boost::program_options;
 
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
-  if (!vm_in.count(name)) {
-    throw std::runtime_error("Error: no " + name + " file was provided\n");
-  } else {
-    std::ifstream test(vm_in[name].as<std::string>());
-    if (!test) {
-      throw std::runtime_error("The " + name + " file " +
-                               vm_in[name].as<std::string>() +
-                               " does not exist.\n");
-    }
-  }
-}
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
 int main(int argc, const char *argv[]) {
 
-  // Program arguments parsing
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
   po::options_description desc("Allowed options");
-  desc.add_options()("help,h", "produce help message")(
-      "xclbin,x", po::value<std::string>()->required(),
-      "the input xclbin path")(
-      "kernel,k", po::value<std::string>()->required(),
-      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "verbosity,v", po::value<int>()->default_value(0),
-      "the verbosity of the output")(
-      "instr,i", po::value<std::string>()->required(),
-      "path of file containing userspace instructions to be sent to the LX6");
   po::variables_map vm;
+  test_utils::add_default_options(desc);
 
-  try {
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
-
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  int do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
 
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
+  constexpr int IN_SIZE = 1024;
+  constexpr int OUT_SIZE = 1024;
 
+  // Load instruction sequence
   std::vector<uint32_t> instr_v =
-      load_instr_sequence(vm["instr"].as<std::string>());
-
-  int verbosity = vm["verbosity"].as<int>();
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
   if (verbosity >= 1)
     std::cout << "Sequence instr count: " << instr_v.size() << "\n";
 
-  // Start the XRT test code
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
   // Get a device handle
   unsigned int device_index = 0;
   auto device = xrt::device(device_index);
@@ -102,6 +61,7 @@ int main(int argc, const char *argv[]) {
     std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
   auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
 
+  // Load the kernel
   if (verbosity >= 1)
     std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
   std::string Node = vm["kernel"].as<std::string>();
@@ -109,37 +69,41 @@ int main(int argc, const char *argv[]) {
   // Get the kernel from the xclbin
   auto xkernels = xclbin.get_kernels();
   auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
-                               [Node](xrt::xclbin::kernel &k) {
+                               [Node, verbosity](xrt::xclbin::kernel &k) {
                                  auto name = k.get_name();
-                                 std::cout << "Name: " << name << std::endl;
+                                 if (verbosity >= 1) {
+                                   std::cout << "Name: " << name << std::endl;
+                                 }
                                  return name.rfind(Node, 0) == 0;
                                });
   auto kernelName = xkernel.get_name();
 
+  // Register xclbin
   if (verbosity >= 1)
     std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
               << "\n";
-
   device.register_xclbin(xclbin);
 
-  // get a hardware context
+  // Get a hardware context
   if (verbosity >= 1)
     std::cout << "Getting hardware context.\n";
   xrt::hw_context context(device, xclbin.get_uuid());
 
-  // get a kernel handle
+  // Get a kernel handle
   if (verbosity >= 1)
     std::cout << "Getting handle to kernel:" << kernelName << "\n";
   auto kernel = xrt::kernel(context, kernelName);
 
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
   auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
   auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects.\n";
@@ -158,7 +122,7 @@ int main(int argc, const char *argv[]) {
 
   if (verbosity >= 1)
     std::cout << "Running Kernel.\n";
-  auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+  auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out);
   run.wait();
 
   bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
@@ -167,7 +131,7 @@ int main(int argc, const char *argv[]) {
 
   int errors = 0;
 
-  for (uint32_t i = 0; i < 64; i++) {
+  for (uint32_t i = 0; i < OUT_SIZE; i++) {
     uint32_t ref = i + 2;
     if (*(bufOut + i) != ref) {
       std::cout << "Error in output " << *(bufOut + i) << " != " << ref
diff --git a/programming_examples/basic/vector_vector_add/README.md b/programming_examples/basic/vector_vector_add/README.md
index c7dd75676a..70aed7e6ae 100644
--- a/programming_examples/basic/vector_vector_add/README.md
+++ b/programming_examples/basic/vector_vector_add/README.md
@@ -10,18 +10,48 @@
 
 # <ins>Vector Vector Add</ins>
 
-A single tile performs a very simple `+` operation from two vectors loaded into memory. The tile then stores the sum of those two vectors back to external memory. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000. 
+A simple binary operator, which uses a single AIE core to add two vectors together.  The overall vector size in this design is `1024` and it is processed by the core in smaller subtiles of size `16`.  It shows how simple it can be to just feed data into the AIEs using the Object FIFO abstraction, and drain the results back to external memory.  This reference design can be run on either a Ryzen™ AI NPU or a VCK5000. 
 
-The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` depends on whether the application is targeting NPU or VCK5000. The AIE tile performs the summation operations, and the Shim tile brings the data back out to external memory.
+The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the summation operations and the Shim tile brings the data back out to external memory.
+
+## Source Files Overview
+
+1. `aie2.py`: defines the AIE array structural design using IRON AIE language bindings. This generates mlir-aie that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and inst.txt for the NPU in Ryzen™ AI). 
+
+1. `test.cpp`: This C++ code is a testbench for the design example targeting Ryzen™ AI (AIE-ML). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results.
+
+1. `test_vck5000.cpp`: This C++ code is a testbench for the design example targeting the VCK5000 PCIe card (AIE). The code is responsible for configuring the AIEs, allocating memory, providing input data, and executing the AIE design on the VCK5000. After executing, the program verifies the results.
+
+## Ryzen™ AI Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
 
-To compile and run the design for NPU:
 ```
 make
+make vectorAdd.exe
+```
+
+To run the design:
+
+```
 make run
 ```
 
-To compile and run the design for VCK5000:
+## VCK5000 Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
+
 ```
 make vck5000
+```
+
+To run the design:
+
+```
 ./test.elf
 ```
+
diff --git a/programming_examples/basic/vector_vector_add/aie2.py b/programming_examples/basic/vector_vector_add/aie2.py
index 581729e6ec..433b6d1ea6 100755
--- a/programming_examples/basic/vector_vector_add/aie2.py
+++ b/programming_examples/basic/vector_vector_add/aie2.py
@@ -17,74 +17,76 @@
 
 
 def my_vector_add():
-    N = 64
+    N = 1024
     n = 16
     N_div_n = N // n
 
     buffer_depth = 2
 
-    with mlir_mod_ctx() as ctx:
-
-        if len(sys.argv) != 3:
-            raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
-
-        if sys.argv[1] == "npu":
-            dev = AIEDevice.npu
-        elif sys.argv[1] == "xcvc1902":
-            dev = AIEDevice.xcvc1902
-        else:
-            raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
-
-        @device(dev)
-        def device_body():
-            memRef_ty = T.memref(n, T.i32())
-
-            # AIE Core Function declarations
-
-            # Tile declarations
-            ShimTile = tile(int(sys.argv[2]), 0)
-            ComputeTile2 = tile(int(sys.argv[2]), 2)
-
-            # AIE-array data movement with object fifos
-            of_in1 = object_fifo("in1", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
-            of_in2 = object_fifo("in2", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
-            of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)
-
-            # Set up compute tiles
-
-            # Compute tile 2
-            @core(ComputeTile2)
-            def core_body():
-                # Effective while(1)
-                for _ in for_(sys.maxsize):
-                    # Number of sub-vector "tile" iterations
-                    for _ in for_(N_div_n):
-                        elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1)
-                        elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1)
-                        elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
-                        for i in for_(n):
-                            v0 = memref.load(elem_in1, [i])
-                            v1 = memref.load(elem_in2, [i])
-                            v2 = arith.addi(v0, v1)
-                            memref.store(v2, elem_out, [i])
-                            yield_([])
-                        of_in1.release(ObjectFifoPort.Consume, 1)
-                        of_in2.release(ObjectFifoPort.Consume, 1)
-                        of_out.release(ObjectFifoPort.Produce, 1)
+    if len(sys.argv) != 3:
+        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
+
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu
+    elif sys.argv[1] == "xcvc1902":
+        dev = AIEDevice.xcvc1902
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+    @device(dev)
+    def device_body():
+        memRef_ty = T.memref(n, T.i32())
+
+        # AIE Core Function declarations
+
+        # Tile declarations
+        ShimTile = tile(int(sys.argv[2]), 0)
+        ComputeTile2 = tile(int(sys.argv[2]), 2)
+
+        # AIE-array data movement with object fifos
+        of_in1 = object_fifo("in1", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
+        of_in2 = object_fifo("in2", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
+        of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)
+
+        # Set up compute tiles
+
+        # Compute tile 2
+        @core(ComputeTile2)
+        def core_body():
+            # Effective while(1)
+            for _ in for_(sys.maxsize):
+                # Number of sub-vector "tile" iterations
+                for _ in for_(N_div_n):
+                    elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1)
+                    elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1)
+                    elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    for i in for_(n):
+                        v0 = memref.load(elem_in1, [i])
+                        v1 = memref.load(elem_in2, [i])
+                        v2 = arith.addi(v0, v1)
+                        memref.store(v2, elem_out, [i])
                         yield_([])
+                    of_in1.release(ObjectFifoPort.Consume, 1)
+                    of_in2.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
                     yield_([])
-
-            # To/from AIE-array data movement
-            tensor_ty = T.memref(N, T.i32())
-
-            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
-            def sequence(A, B, C):
-                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-                npu_sync(column=0, row=0, direction=0, channel=0)
-
-    print(ctx.module)
-
-
-my_vector_add()
+                yield_([])
+
+        # To/from AIE-array data movement
+        tensor_ty = T.memref(N, T.i32())
+
+        @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
+        def sequence(A, B, C):
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
+            npu_sync(column=0, row=0, direction=0, channel=0)
+
+
+with mlir_mod_ctx() as ctx:
+    my_vector_add()
+    res = ctx.module.operation.verify()
+    if res == True:
+        print(ctx.module)
+    else:
+        print(res)
diff --git a/programming_examples/basic/vector_vector_add/test.cpp b/programming_examples/basic/vector_vector_add/test.cpp
index 550915c698..f8e300036b 100644
--- a/programming_examples/basic/vector_vector_add/test.cpp
+++ b/programming_examples/basic/vector_vector_add/test.cpp
@@ -20,79 +20,38 @@
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-constexpr int IN_SIZE = 64;
-constexpr int OUT_SIZE = 64;
+#include "test_utils.h"
 
 namespace po = boost::program_options;
 
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
-  if (!vm_in.count(name)) {
-    throw std::runtime_error("Error: no " + name + " file was provided\n");
-  } else {
-    std::ifstream test(vm_in[name].as<std::string>());
-    if (!test) {
-      throw std::runtime_error("The " + name + " file " +
-                               vm_in[name].as<std::string>() +
-                               " does not exist.\n");
-    }
-  }
-}
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
 int main(int argc, const char *argv[]) {
 
-  // Program arguments parsing
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
   po::options_description desc("Allowed options");
-  desc.add_options()("help,h", "produce help message")(
-      "xclbin,x", po::value<std::string>()->required(),
-      "the input xclbin path")(
-      "kernel,k", po::value<std::string>()->required(),
-      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "verbosity,v", po::value<int>()->default_value(0),
-      "the verbosity of the output")(
-      "instr,i", po::value<std::string>()->required(),
-      "path of file containing userspace instructions to be sent to the LX6");
   po::variables_map vm;
+  test_utils::add_default_options(desc);
 
-  try {
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
-
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  int do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
 
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
+  constexpr int IN_SIZE = 1024;
+  constexpr int OUT_SIZE = 1024;
 
+  // Load instruction sequence
   std::vector<uint32_t> instr_v =
-      load_instr_sequence(vm["instr"].as<std::string>());
-
-  int verbosity = vm["verbosity"].as<int>();
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
   if (verbosity >= 1)
     std::cout << "Sequence instr count: " << instr_v.size() << "\n";
 
-  // Start the XRT test code
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
   // Get a device handle
   unsigned int device_index = 0;
   auto device = xrt::device(device_index);
@@ -102,6 +61,7 @@ int main(int argc, const char *argv[]) {
     std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
   auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
 
+  // Load the kernel
   if (verbosity >= 1)
     std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
   std::string Node = vm["kernel"].as<std::string>();
@@ -109,29 +69,35 @@ int main(int argc, const char *argv[]) {
   // Get the kernel from the xclbin
   auto xkernels = xclbin.get_kernels();
   auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
-                               [Node](xrt::xclbin::kernel &k) {
+                               [Node, verbosity](xrt::xclbin::kernel &k) {
                                  auto name = k.get_name();
-                                 std::cout << "Name: " << name << std::endl;
+                                 if (verbosity >= 1) {
+                                   std::cout << "Name: " << name << std::endl;
+                                 }
                                  return name.rfind(Node, 0) == 0;
                                });
   auto kernelName = xkernel.get_name();
 
+  // Register xclbin
   if (verbosity >= 1)
     std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
               << "\n";
-
   device.register_xclbin(xclbin);
 
-  // get a hardware context
+  // Get a hardware context
   if (verbosity >= 1)
     std::cout << "Getting hardware context.\n";
   xrt::hw_context context(device, xclbin.get_uuid());
 
-  // get a kernel handle
+  // Get a kernel handle
   if (verbosity >= 1)
     std::cout << "Getting handle to kernel:" << kernelName << "\n";
   auto kernel = xrt::kernel(context, kernelName);
 
+  // ------------------------------------------------------
+  // Initialize input/output buffer sizes and sync them
+  // ------------------------------------------------------
+
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
   auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
@@ -174,7 +140,7 @@ int main(int argc, const char *argv[]) {
 
   int errors = 0;
 
-  for (uint32_t i = 0; i < 64; i++) {
+  for (uint32_t i = 0; i < OUT_SIZE; i++) {
     if (*(bufOut + i) != *(bufInA + i) + *(bufInB + i)) {
       std::cout << "Error in output " << *(bufOut + i)
                 << " != " << *(bufInA + i) << " + " << *(bufInB + i)
diff --git a/programming_examples/basic/vector_vector_mul/README.md b/programming_examples/basic/vector_vector_mul/README.md
index 331f832033..49c0964fce 100644
--- a/programming_examples/basic/vector_vector_mul/README.md
+++ b/programming_examples/basic/vector_vector_mul/README.md
@@ -8,20 +8,50 @@
 // 
 //===----------------------------------------------------------------------===//-->
 
-# <ins>Vector Vector Multiplication</ins>
+# <ins>Vector Vector Multiply</ins>
 
-A single tile performs a very simple `*` operation from two vectors loaded into memory. The tile then stores the element-wise multiplication of those two vectors back to external memory. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.
+A simple binary operator, which uses a single AIE core to multiply two vectors together.  The overall vector size in this design is `1024` and it is processed by the core in smaller sub-tiles of size `16`.  It shows how simple it can be to just feed data into the AIEs using the ObjectFIFO abstraction, and drain the results back to external memory.  This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.
 
-The kernel executes on the AIE tile (`col`, 2). Both input vectors are brought into the tile from the Shim tile (`col`, 0). The value of `col` depends on whether the application targets NPU or VCK5000. The AIE tile performs the multiplication operations, and the Shim tile brings the data back out to external memory.
+The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the multiplication operations and the Shim tile brings the data back out to external memory.
+
+## Source Files Overview
+
+1. `aie2.py`: Defines the AIE array structural design using IRON AIE language bindings. This generates mlir-aie that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and inst.txt for the NPU in Ryzen™ AI).
+
+1. `test.cpp`: This C++ code is a testbench for the design example targeting Ryzen™ AI (AIE-ML). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results.
+
+1. `test_vck5000.cpp`: This C++ code is a testbench for the design example targeting the VCK5000 PCIe card (AIE). The code is responsible for configuring the AIEs, allocating memory, providing input data, and executing the AIE design on the VCK5000. After executing, the program verifies the results.
+
+## Ryzen™ AI Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
 
-To compile and run the design for NPU:
 ```
 make
+make vectorAdd.exe
+```
+
+To run the design:
+
+```
 make run
 ```
 
-To compile and run the design for VCK5000:
+## VCK5000 Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
+
 ```
 make vck5000
+```
+
+To run the design:
+
+```
 ./test.elf
 ```
+
diff --git a/programming_examples/basic/vector_vector_mul/aie2.py b/programming_examples/basic/vector_vector_mul/aie2.py
index 209f5243bb..c16ba68103 100755
--- a/programming_examples/basic/vector_vector_mul/aie2.py
+++ b/programming_examples/basic/vector_vector_mul/aie2.py
@@ -16,75 +16,77 @@
 import sys
 
 
-def my_vector_add():
-    N = 64
+def my_vector_mul():
+    N = 1024
     n = 16
     N_div_n = N // n
 
     buffer_depth = 2
 
-    with mlir_mod_ctx() as ctx:
-
-        if len(sys.argv) != 3:
-            raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
-
-        if sys.argv[1] == "npu":
-            dev = AIEDevice.npu
-        elif sys.argv[1] == "xcvc1902":
-            dev = AIEDevice.xcvc1902
-        else:
-            raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
-
-        @device(dev)
-        def device_body():
-            memRef_ty = T.memref(n, T.i32())
-
-            # AIE Core Function declarations
-
-            # Tile declarations
-            ShimTile = tile(int(sys.argv[2]), 0)
-            ComputeTile2 = tile(int(sys.argv[2]), 2)
-
-            # AIE-array data movement with object fifos
-            of_in1 = object_fifo("in1", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
-            of_in2 = object_fifo("in2", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
-            of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)
-
-            # Set up compute tiles
-
-            # Compute tile 2
-            @core(ComputeTile2)
-            def core_body():
-                # Effective while(1)
-                for _ in for_(sys.maxsize):
-                    # Number of sub-vector "tile" iterations
-                    for _ in for_(N_div_n):
-                        elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1)
-                        elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1)
-                        elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
-                        for i in for_(n):
-                            v0 = memref.load(elem_in1, [i])
-                            v1 = memref.load(elem_in2, [i])
-                            v2 = arith.muli(v0, v1)
-                            memref.store(v2, elem_out, [i])
-                            yield_([])
-                        of_in1.release(ObjectFifoPort.Consume, 1)
-                        of_in2.release(ObjectFifoPort.Consume, 1)
-                        of_out.release(ObjectFifoPort.Produce, 1)
+    if len(sys.argv) != 3:
+        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
+
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu
+    elif sys.argv[1] == "xcvc1902":
+        dev = AIEDevice.xcvc1902
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+    @device(dev)
+    def device_body():
+        memRef_ty = T.memref(n, T.i32())
+
+        # AIE Core Function declarations
+
+        # Tile declarations
+        ShimTile = tile(int(sys.argv[2]), 0)
+        ComputeTile2 = tile(int(sys.argv[2]), 2)
+
+        # AIE-array data movement with object fifos
+        of_in1 = object_fifo("in1", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
+        of_in2 = object_fifo("in2", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
+        of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)
+
+        # Set up compute tiles
+
+        # Compute tile 2
+        @core(ComputeTile2)
+        def core_body():
+            # Effective while(1)
+            for _ in for_(sys.maxsize):
+                # Number of sub-vector "tile" iterations
+                for _ in for_(N_div_n):
+                    elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1)
+                    elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1)
+                    elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    for i in for_(n):
+                        v0 = memref.load(elem_in1, [i])
+                        v1 = memref.load(elem_in2, [i])
+                        v2 = arith.muli(v0, v1)
+                        memref.store(v2, elem_out, [i])
                         yield_([])
+                    of_in1.release(ObjectFifoPort.Consume, 1)
+                    of_in2.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
                     yield_([])
-
-            # To/from AIE-array data movement
-            tensor_ty = T.memref(N, T.i32())
-
-            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
-            def sequence(A, B, C):
-                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-                npu_sync(column=0, row=0, direction=0, channel=0)
-
-    print(ctx.module)
-
-
-my_vector_add()
+                yield_([])
+
+        # To/from AIE-array data movement
+        tensor_ty = T.memref(N, T.i32())
+
+        @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
+        def sequence(A, B, C):
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
+            npu_sync(column=0, row=0, direction=0, channel=0)
+
+
+with mlir_mod_ctx() as ctx:
+    my_vector_mul()
+    res = ctx.module.operation.verify()
+    if res == True:
+        print(ctx.module)
+    else:
+        print(res)
diff --git a/programming_examples/basic/vector_vector_mul/test.cpp b/programming_examples/basic/vector_vector_mul/test.cpp
index 1ba8c8159e..5041f80fe3 100644
--- a/programming_examples/basic/vector_vector_mul/test.cpp
+++ b/programming_examples/basic/vector_vector_mul/test.cpp
@@ -20,79 +20,38 @@
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-constexpr int IN_SIZE = 64;
-constexpr int OUT_SIZE = 64;
+#include "test_utils.h"
 
 namespace po = boost::program_options;
 
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
-  if (!vm_in.count(name)) {
-    throw std::runtime_error("Error: no " + name + " file was provided\n");
-  } else {
-    std::ifstream test(vm_in[name].as<std::string>());
-    if (!test) {
-      throw std::runtime_error("The " + name + " file " +
-                               vm_in[name].as<std::string>() +
-                               " does not exist.\n");
-    }
-  }
-}
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
 int main(int argc, const char *argv[]) {
 
-  // Program arguments parsing
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
   po::options_description desc("Allowed options");
-  desc.add_options()("help,h", "produce help message")(
-      "xclbin,x", po::value<std::string>()->required(),
-      "the input xclbin path")(
-      "kernel,k", po::value<std::string>()->required(),
-      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "verbosity,v", po::value<int>()->default_value(0),
-      "the verbosity of the output")(
-      "instr,i", po::value<std::string>()->required(),
-      "path of file containing userspace instructions to be sent to the LX6");
   po::variables_map vm;
+  test_utils::add_default_options(desc);
 
-  try {
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
-
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  int do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
 
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
+  constexpr int IN_SIZE = 1024;
+  constexpr int OUT_SIZE = 1024;
 
+  // Load instruction sequence
   std::vector<uint32_t> instr_v =
-      load_instr_sequence(vm["instr"].as<std::string>());
-
-  int verbosity = vm["verbosity"].as<int>();
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
   if (verbosity >= 1)
     std::cout << "Sequence instr count: " << instr_v.size() << "\n";
 
-  // Start the XRT test code
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
   // Get a device handle
   unsigned int device_index = 0;
   auto device = xrt::device(device_index);
@@ -102,6 +61,7 @@ int main(int argc, const char *argv[]) {
     std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
   auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
 
+  // Load the kernel
   if (verbosity >= 1)
     std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
   std::string Node = vm["kernel"].as<std::string>();
@@ -109,29 +69,35 @@ int main(int argc, const char *argv[]) {
   // Get the kernel from the xclbin
   auto xkernels = xclbin.get_kernels();
   auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
-                               [Node](xrt::xclbin::kernel &k) {
+                               [Node, verbosity](xrt::xclbin::kernel &k) {
                                  auto name = k.get_name();
-                                 std::cout << "Name: " << name << std::endl;
+                                 if (verbosity >= 1) {
+                                   std::cout << "Name: " << name << std::endl;
+                                 }
                                  return name.rfind(Node, 0) == 0;
                                });
   auto kernelName = xkernel.get_name();
 
+  // Register xclbin
   if (verbosity >= 1)
     std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
               << "\n";
-
   device.register_xclbin(xclbin);
 
-  // get a hardware context
+  // Get a hardware context
   if (verbosity >= 1)
     std::cout << "Getting hardware context.\n";
   xrt::hw_context context(device, xclbin.get_uuid());
 
-  // get a kernel handle
+  // Get a kernel handle
   if (verbosity >= 1)
     std::cout << "Getting handle to kernel:" << kernelName << "\n";
   auto kernel = xrt::kernel(context, kernelName);
 
+  // ------------------------------------------------------
+  // Initialize input/output buffer sizes and sync them
+  // ------------------------------------------------------
+
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
   auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
@@ -174,15 +140,14 @@ int main(int argc, const char *argv[]) {
 
   int errors = 0;
 
-  for (uint32_t i = 0; i < 64; i++) {
-    uint32_t ref = i + 2;
+  for (uint32_t i = 0; i < OUT_SIZE; i++) {
     if (*(bufOut + i) != *(bufInA + i) * *(bufInB + i)) {
       std::cout << "Error in output " << *(bufOut + i)
-                << " != " << *(bufInA + i) << " + " << *(bufInB + i)
+                << " != " << *(bufInA + i) << " * " << *(bufInB + i)
                 << std::endl;
       errors++;
     } else {
-      if (verbosity >= 1)
+      if (verbosity > 1)
         std::cout << "Correct output " << *(bufOut + i)
                   << " == " << *(bufInA + i) * *(bufInB + i) << std::endl;
     }