From 91247eab6fafc01a2e3d8dd552ef22e50574b9a8 Mon Sep 17 00:00:00 2001
From: Jeff Fifield <jeff.fifield@amd.com>
Date: Tue, 16 Apr 2024 14:18:11 -0600
Subject: [PATCH] move designs (#1233)  (#1265)

Co-authored-by: Philip James-Roxby <phil.jamesroxby@gmail.com>
Co-authored-by: pjr <pjr@xilinx.com>
Co-authored-by: Kristof Denolf <kristof.denolf@amd.com>
---
 aie_kernels/{ => aie2}/relu.cc                |   0
 programming_examples/basic/relu/test.cpp      | 326 ------------------
 .../basic/vector_softmax/aie2.py.orig         | 121 -------
 .../vector_softmax/bf16_softmax.mlir.orig     |  34 --
 .../basic/vector_softmax/sweep.py             |  20 --
 .../basic/vector_softmax/test.cpp             | 320 -----------------
 .../basic/vector_softmax/test.cpp.orig        | 325 -----------------
 .../{basic => ml}/relu/CMakeLists.txt         |   0
 .../{basic => ml}/relu/Makefile               |  27 +-
 .../{basic => ml}/relu/aie2.py                |   6 +-
 programming_examples/ml/relu/run.lit          |  11 +
 programming_examples/ml/relu/test.cpp         | 246 +++++++++++++
 .../vector_softmax/CMakeLists.txt             |   0
 .../{basic => ml}/vector_softmax/Makefile     |  12 +-
 .../{basic => ml}/vector_softmax/README.md    |   0
 .../{basic => ml}/vector_softmax/aie2.py      |   6 +-
 .../vector_softmax/bf16_softmax.mlir          |   0
 .../ml/vector_softmax/run.lit                 |  15 +
 .../exp.cc => ml/vector_softmax/softmax.cc}   |   3 +-
 .../ml/vector_softmax/test.cpp                | 256 ++++++++++++++
 20 files changed, 552 insertions(+), 1176 deletions(-)
 rename aie_kernels/{ => aie2}/relu.cc (100%)
 delete mode 100644 programming_examples/basic/relu/test.cpp
 delete mode 100755 programming_examples/basic/vector_softmax/aie2.py.orig
 delete mode 100644 programming_examples/basic/vector_softmax/bf16_softmax.mlir.orig
 delete mode 100644 programming_examples/basic/vector_softmax/sweep.py
 delete mode 100644 programming_examples/basic/vector_softmax/test.cpp
 delete mode 100644 programming_examples/basic/vector_softmax/test.cpp.orig
 rename programming_examples/{basic => ml}/relu/CMakeLists.txt (100%)
 rename programming_examples/{basic => ml}/relu/Makefile (56%)
 rename programming_examples/{basic => ml}/relu/aie2.py (97%)
 create mode 100644 programming_examples/ml/relu/run.lit
 create mode 100644 programming_examples/ml/relu/test.cpp
 rename programming_examples/{basic => ml}/vector_softmax/CMakeLists.txt (100%)
 rename programming_examples/{basic => ml}/vector_softmax/Makefile (89%)
 rename programming_examples/{basic => ml}/vector_softmax/README.md (100%)
 rename programming_examples/{basic => ml}/vector_softmax/aie2.py (95%)
 rename programming_examples/{basic => ml}/vector_softmax/bf16_softmax.mlir (100%)
 create mode 100644 programming_examples/ml/vector_softmax/run.lit
 rename programming_examples/{basic/vector_softmax/exp.cc => ml/vector_softmax/softmax.cc} (84%)
 create mode 100644 programming_examples/ml/vector_softmax/test.cpp
diff --git a/aie_kernels/relu.cc b/aie_kernels/aie2/relu.cc
similarity index 100%
rename from aie_kernels/relu.cc
rename to aie_kernels/aie2/relu.cc
diff --git a/programming_examples/basic/relu/test.cpp b/programming_examples/basic/relu/test.cpp
deleted file mode 100644
index 14bb24babe..0000000000
--- a/programming_examples/basic/relu/test.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#include <bits/stdc++.h>
-#include <boost/program_options.hpp>
-#include <chrono>
-#include <cstdint>
-#include <cstdlib>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <math.h>
-#include <sstream>
-#include <stdfloat>
-#include <string>
-#include <vector>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-constexpr bool VERIFY = true;
-
-constexpr int IN_SIZE = 65536;
-constexpr int OUT_SIZE = IN_SIZE;
-
-namespace po = boost::program_options;
-
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
-  if (!vm_in.count(name)) {
-    throw std::runtime_error("Error: no " + name + " file was provided\n");
-  } else {
-    std::ifstream test(vm_in[name].as<std::string>());
-    if (!test) {
-      throw std::runtime_error("The " + name + " file " +
-                               vm_in[name].as<std::string>() +
-                               " does not exist.\n");
-    }
-  }
-}
-
-void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) {
-  std::ofstream fout(path);
-  uint32_t *traceOut = (uint32_t *)traceOutPtr;
-  for (int i = 0; i < trace_size / sizeof(traceOut[0]); i++) {
-    fout << std::setfill('0') << std::setw(8) << std::hex << (int)traceOut[i];
-    fout << std::endl;
-  }
-}
-
-static inline std::bfloat16_t random_bfloat16_t(float scale, float bias) {
-  // Random numbers should NOT be uniformly between 0 and 1, because that
-  // would make the matrix product AB always close to 1.
-  return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) - bias);
-}
-
-bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) {
-  std::bfloat16_t diff = fabs(a - b);
-  if ((diff / 4.0) < 0.001)
-    return true;
-  else
-    return false;
-}
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
-int main(int argc, const char *argv[]) {
-
-  // Program arguments parsing
-  po::options_description desc("Allowed options");
-
-  desc.add_options()("help,h", "produce help message")(
-      "xclbin,x", po::value<std::string>()->required(),
-      "the input xclbin path")(
-      "kernel,k", po::value<std::string>()->required(),
-      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "trace_sz,t", po::value<int>()->default_value(0),
-      "the depth of the trace buffer")(
-      "trace_file,f", po::value<std::string>()->default_value("trace.txt"),
-      "the output trace path")("verbosity,v",
-                               po::value<int>()->default_value(0),
-                               "the verbosity of the output")(
-      "instr,i", po::value<std::string>()->required(),
-      "path of file containing userspace instructions to be sent to the LX6");
-  po::variables_map vm;
-
-  try {
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
-
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
-
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
-
-  std::vector<uint32_t> instr_v =
-      load_instr_sequence(vm["instr"].as<std::string>());
-
-  int verbosity = vm["verbosity"].as<int>();
-  if (verbosity >= 1)
-    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
-
-  int trace_size = vm["trace_sz"].as<int>();
-
-  // Start the XRT test code
-  // Get a device handle
-  unsigned int device_index = 0;
-  auto device = xrt::device(device_index);
-
-  // Load the xclbin
-  if (verbosity >= 1)
-    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
-  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
-
-  if (verbosity >= 1)
-    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
-  std::string Node = vm["kernel"].as<std::string>();
-
-  // Get the kernel from the xclbin
-  auto xkernels = xclbin.get_kernels();
-  auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
-                               [Node](xrt::xclbin::kernel &k) {
-                                 auto name = k.get_name();
-                                 std::cout << "Name: " << name << std::endl;
-                                 return name.rfind(Node, 0) == 0;
-                               });
-  auto kernelName = xkernel.get_name();
-
-  if (verbosity >= 1)
-    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
-              << "\n";
-
-  device.register_xclbin(xclbin);
-
-  // get a hardware context
-  if (verbosity >= 1)
-    std::cout << "Getting hardware context.\n";
-  xrt::hw_context context(device, xclbin.get_uuid());
-
-  // get a kernel handle
-  if (verbosity >= 1)
-    std::cout << "Getting handle to kernel:" << kernelName << "\n";
-  auto kernel = xrt::kernel(context, kernelName);
-
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-
-  auto real_out_size = OUT_SIZE * sizeof(std::bfloat16_t) + trace_size;
-  auto bo_out = xrt::bo(device, real_out_size, XRT_BO_FLAGS_HOST_ONLY,
-                        kernel.group_id(3));
-
-  if (verbosity >= 1)
-    std::cout << "Writing data into buffer objects.\n";
-
-  std::bfloat16_t *bufA = bo_inA.map<std::bfloat16_t *>();
-  std::vector<std::bfloat16_t> AVec(IN_SIZE);
-  for (int i = 0; i < IN_SIZE; i++)
-    AVec[i] = random_bfloat16_t(4.0, 2.0);
-  memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t)));
-
-  void *bufInstr = bo_instr.map<void *>();
-  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  int sticky_errors = 0;
-
-  unsigned num_iter = 2;
-  float npu_time_total = 0;
-  float npu_time_min = 9999999;
-  float npu_time_max = 0;
-  for (unsigned iter = 0; iter < num_iter; iter++) {
-
-    if (verbosity >= 1)
-      std::cout << "Running Kernel.\n";
-
-    auto start = std::chrono::high_resolution_clock::now();
-
-    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out);
-    run.wait();
-    auto stop = std::chrono::high_resolution_clock::now();
-
-    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-    std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
-
-    int errors = 0;
-
-    if (VERIFY) {
-      if (verbosity >= 1) {
-        std::cout << "Verifying results ..." << std::endl;
-      }
-      for (uint32_t i = 0; i < IN_SIZE; i++) {
-        std::bfloat16_t ref = 0.0;
-        if (AVec[i] > 0.0)
-          ref = AVec[i];
-        if (!nearly_equal(*(bufOut + i), ref)) {
-          std::cout << "Error in " << i << " output " << *(bufOut + i)
-                    << " != " << ref << " actual max(" << AVec[i] << ", 0.0"
-                    << std::endl;
-          errors++;
-          sticky_errors++;
-        } else {
-          if (verbosity >= 2)
-            std::cout << "Correct " << i << " output " << *(bufOut + i)
-                      << " == " << ref << std::endl;
-        }
-      }
-    } else {
-      if (verbosity >= 1)
-        std::cout << "WARNING: vector-scalar results not verified."
-                  << std::endl;
-    }
-
-    float npu_time =
-        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
-            .count();
-
-    npu_time_total += npu_time;
-    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
-    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
-
-    if (trace_size > 0) {
-      write_out_trace(((char *)bufOut) + (OUT_SIZE * 2), trace_size,
-                      vm["trace_file"].as<std::string>());
-    }
-
-    if (VERIFY) {
-      if (!errors) {
-        std::cout << iter << ": pass!\n";
-      } else {
-        std::cout << iter << ": fail! " << errors << " errors\n";
-      }
-    }
-  }
-
-  std::cout << "Avg NPU exec time: " << npu_time_total / num_iter << "us."
-            << std::endl;
-  std::cout << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
-  std::cout << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
-
-  // Let's figure out how many cycles it takes a core to do a single e^x
-  // There are 4 cores, so the total number of e^x's it does is one quarter of
-  // the test size
-
-  int per_core_calcs = IN_SIZE / 4;
-  float avg_npu_time = npu_time_total / num_iter;
-  float avg_npu_clocks =
-      avg_npu_time / 1.0E-3; // Time is in uS, but the AIE is clocked in nS
-  float clocks_per_calc = avg_npu_clocks / per_core_calcs;
-  std::cout << "Clocks per calc " << clocks_per_calc << std::endl;
-
-  // Lets benchmark the CPU
-  float cpu_time_total = 0;
-  float cpu_time_min = 9999999;
-  float cpu_time_max = 0;
-  for (unsigned iter = 0; iter < num_iter; iter++) {
-
-    std::vector<std::bfloat16_t> AVec(IN_SIZE);
-    std::vector<std::bfloat16_t> ResVec(IN_SIZE);
-    for (int i = 0; i < IN_SIZE; i++) {
-      AVec[i] = random_bfloat16_t(4.0, 2.0);
-    }
-    auto start = std::chrono::high_resolution_clock::now();
-    for (int i = 0; i < IN_SIZE; i++) {
-      ResVec[i] = exp(AVec[i]);
-    }
-    auto stop = std::chrono::high_resolution_clock::now();
-    float cpu_time =
-        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
-            .count();
-
-    cpu_time_total += cpu_time;
-    cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min;
-    cpu_time_max = (cpu_time > cpu_time_max) ? cpu_time : cpu_time_max;
-  }
-  std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us."
-            << std::endl;
-  std::cout << "Min CPU matmul time: " << cpu_time_min << "us." << std::endl;
-  std::cout << "Max CPU matmul time: " << cpu_time_max << "us." << std::endl;
-
-  if (VERIFY) {
-    if (!sticky_errors) {
-      std::cout << std::endl << "PASS!" << std::endl << std::endl;
-      return 0;
-    } else {
-      std::cout << std::endl << "FAIL." << std::endl << std::endl;
-      return 1;
-    }
-  } else {
-    std::cout << "Verification skipped, but I'm sure it worked.  I trust in you"
-              << std::endl;
-  }
-  return 0;
-}
diff --git a/programming_examples/basic/vector_softmax/aie2.py.orig b/programming_examples/basic/vector_softmax/aie2.py.orig
deleted file mode 100755
index bbdc8ab5c8..0000000000
--- a/programming_examples/basic/vector_softmax/aie2.py.orig
+++ /dev/null
@@ -1,121 +0,0 @@
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2023 AMD Inc.
-
-import sys
-
-from aie.dialects.aie import *
-from aie.dialects.aiex import *
-from aie.dialects.scf import *
-from aie.extras.context import mlir_mod_ctx
-
-
-def my_eltwise_add():
-
-    word_size_in = 2
-    N = 65536 #*1024
-    N_in_bytes = N * word_size_in
-
-    A_sz_in_i32s = N_in_bytes // 4
-    C_sz_in_i32s = N_in_bytes // 4
-
-    # Tile sizes
-    n = 1024
-    N_div_n = N // n
-
-    n_cores = 4
-    tiles = N_div_n // n_cores
-    buffer_depth = 2
-
-    with mlir_mod_ctx() as ctx:
-
-        @device(AIEDevice.ipu)
-        def device_body():
-            memRef_ty = T.memref(n, T.bf16())
-
-            # Type used in the tile memory
-            memRef_A_ty = T.memref(n, T.bf16())
-            memRef_C_ty = T.memref(n, T.bf16())
-
-            # Type used in the memory tile which aggregates across the 4 cores
-            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
-            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())
-
-            # AIE Core Function declarations
-
-            exp_bf16_vector = external_func("exp_bf16_vector", inputs=[memRef_ty, memRef_ty])
-
-            # Tile declarations
-            ShimTile = tile(0, 0)
-
-            MemTile = tile(0, 1)
-            cores = [tile(0, 2 + i) for i in range(n_cores)]
-
-            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
-            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]
-
-            inA_fifos = {}
-            outC_fifos = {}
-
-            # AIE-array data movement with object fifos
-            # Input A
-            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
-            for i in range(n_cores):
-                inA_fifos[inA_fifo_names[i]] = object_fifo(
-                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
-                )
-            object_fifo_link(inA, inA_fifo_names)
-
-            # Output C
-            for i in range(n_cores):
-                outC_fifos[outC_fifo_names[i]] = object_fifo(
-                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
-                )
-            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
-            object_fifo_link(outC_fifo_names[0:n_cores], outC)
-
-            # Set up compute tiles
-            for i in range(n_cores):
-                # Compute tile i
-                @core(cores[i], "kernels.a")
-                def core_body():
-                    for _ in for_(0xFFFFFFFF):
-                        for _ in for_(tiles):
-                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
-                                ObjectFifoPort.Produce, 1
-                            )
-                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
-                                ObjectFifoPort.Consume, 1
-                            )
-
-                            call(exp_bf16_vector,[elem_in_a, elem_out])
-
-                            inA_fifos[inA_fifo_names[i]].release(
-                                ObjectFifoPort.Consume, 1
-                            )
-                            outC_fifos[outC_fifo_names[i]].release(
-                                ObjectFifoPort.Produce, 1
-                            )
-                            yield_([])
-                        yield_([])
-
-            # To/from AIE-array data movement
-            tensor_ty = T.memref(N, T.i32())
-
-            @FuncOp.from_py_func(tensor_ty, tensor_ty)
-            def sequence(A, C):
-                ipu_dma_memcpy_nd(
-                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
-                )
-                ipu_dma_memcpy_nd(
-                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
-                )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
-
-    print(ctx.module)
-
-
-my_eltwise_add()
diff --git a/programming_examples/basic/vector_softmax/bf16_softmax.mlir.orig b/programming_examples/basic/vector_softmax/bf16_softmax.mlir.orig
deleted file mode 100644
index cd72bcd0d3..0000000000
--- a/programming_examples/basic/vector_softmax/bf16_softmax.mlir.orig
+++ /dev/null
@@ -1,34 +0,0 @@
-module {
-  func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) {
-    %cst = arith.constant 0.000000e+00 : f32
-    %cst_0 = arith.constant 1.000000e+00 : f32
-    %cst_1 = arith.constant 0.000000e+00 : bf16
-    %cst_2 = arith.constant dense<0xFF80> : vector<32xbf16>
-    %0 = affine.for %arg2 = 0 to 1024 step 32 iter_args(%arg3 = %cst_2) -> (vector<32xbf16>) {
-      %5 = vector.transfer_read %arg0[%arg2], %cst_1 : memref<1024xbf16>, vector<32xbf16>
-      %6 = arith.maximumf %arg3, %5 : vector<32xbf16>
-      affine.yield %6 : vector<32xbf16>
-    }
-    %1 = vector.reduction <maximumf>, %0 : vector<32xbf16> into bf16
-    affine.for %arg2 = 0 to 1024 {
-      %5 = affine.load %arg0[%arg2] : memref<1024xbf16>
-      %6 = arith.subf %5, %1 : bf16
-      %7 = math.exp %6 : bf16
-      affine.store %7, %arg0[%arg2] : memref<1024xbf16>
-    }
-    %2 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst) -> (f32) {
-      %5 = affine.load %arg0[%arg2] : memref<1024xbf16>
-      %6 = arith.extf %5 : bf16 to f32
-      %7 = arith.addf %arg3, %6 : f32
-      affine.yield %7 : f32
-    }
-    %3 = arith.divf %cst_0, %2 : f32
-    %4 = arith.truncf %3 : f32 to bf16
-    affine.for %arg2 = 0 to 1024 {
-      %5 = affine.load %arg0[%arg2] : memref<1024xbf16>
-      %6 = arith.mulf %5, %4 : bf16
-      affine.store %6, %arg1[%arg2] : memref<1024xbf16>
-    }
-    return
-  }
-}
\ No newline at end of file
diff --git a/programming_examples/basic/vector_softmax/sweep.py b/programming_examples/basic/vector_softmax/sweep.py
deleted file mode 100644
index fabf5e70da..0000000000
--- a/programming_examples/basic/vector_softmax/sweep.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import os
-
-for action in ["rm -f", "touch"]:
-    cmd = f"{action} results.csv"
-    os.system(cmd)
-
-
-for s in [16384, 32768, 65536, 131072, 262144]:
-    for i in [64, 128, 256, 512, 1024]:
-        for f in ["bf16_softmax.mlir", "test.cpp", "aie2.py"]:
-            sed = f"sed 's\\1024\\{i}\g' {f}.orig > {f}.first"
-            os.system(sed)
-            sed = f"sed 's\\65536\\{s}\g' {f}.first > {f}"
-            os.system(sed)
-        make_clean = f"make clean > /dev/null"
-        os.system(make_clean)
-        make_all = f"make all"
-        os.system(make_all)
-        make_profile = f"make profile"
-        os.system(make_profile)
diff --git a/programming_examples/basic/vector_softmax/test.cpp b/programming_examples/basic/vector_softmax/test.cpp
deleted file mode 100644
index 9354405139..0000000000
--- a/programming_examples/basic/vector_softmax/test.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#include <bits/stdc++.h>
-#include <boost/program_options.hpp>
-#include <chrono>
-#include <cstdint>
-#include <cstdlib>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <math.h>
-#include <sstream>
-#include <stdfloat>
-#include <string>
-#include <vector>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-constexpr bool VERIFY = true;
-
-constexpr int IN_SIZE = 262144; //*1024;
-constexpr int TILE_SIZE = 1024;
-constexpr int OUT_SIZE = IN_SIZE;
-
-namespace po = boost::program_options;
-
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
-  if (!vm_in.count(name)) {
-    throw std::runtime_error("Error: no " + name + " file was provided\n");
-  } else {
-    std::ifstream test(vm_in[name].as<std::string>());
-    if (!test) {
-      throw std::runtime_error("The " + name + " file " +
-                               vm_in[name].as<std::string>() +
-                               " does not exist.\n");
-    }
-  }
-}
-
-static inline std::bfloat16_t random_bfloat16_t() {
-  // Random numbers should NOT be uniformly between 0 and 1, because that
-  // would make the matrix product AB always close to 1.
-  return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX));
-}
-
-bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) {
-  std::bfloat16_t diff = fabs(a - b);
-  if ((diff / a) < 0.1)
-    return true;
-  else
-    return false;
-}
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
-int main(int argc, const char *argv[]) {
-
-  // Program arguments parsing
-  po::options_description desc("Allowed options");
-
-  desc.add_options()("help,h", "produce help message")(
-      "xclbin,x", po::value<std::string>()->required(),
-      "the input xclbin path")(
-      "kernel,k", po::value<std::string>()->required(),
-      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "verbosity,v", po::value<int>()->default_value(0),
-      "the verbosity of the output")(
-      "profile,p", po::value<std::string>()->default_value(""), "CSV profile")(
-      "instr,i", po::value<std::string>()->required(),
-      "path of file containing userspace instructions to be sent to the LX6");
-  po::variables_map vm;
-
-  try {
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
-
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
-
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
-
-  std::vector<uint32_t> instr_v =
-      load_instr_sequence(vm["instr"].as<std::string>());
-
-  int verbosity = vm["verbosity"].as<int>();
-  if (verbosity >= 1)
-    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
-
-  // Start the XRT test code
-  // Get a device handle
-  unsigned int device_index = 0;
-  auto device = xrt::device(device_index);
-
-  // Load the xclbin
-  if (verbosity >= 1)
-    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
-  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
-
-  if (verbosity >= 1)
-    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
-  std::string Node = vm["kernel"].as<std::string>();
-
-  // Get the kernel from the xclbin
-  auto xkernels = xclbin.get_kernels();
-  auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
-                               [Node](xrt::xclbin::kernel &k) {
-                                 auto name = k.get_name();
-                                 std::cout << "Name: " << name << std::endl;
-                                 return name.rfind(Node, 0) == 0;
-                               });
-  auto kernelName = xkernel.get_name();
-
-  if (verbosity >= 1)
-    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
-              << "\n";
-
-  device.register_xclbin(xclbin);
-
-  // get a hardware context
-  if (verbosity >= 1)
-    std::cout << "Getting hardware context.\n";
-  xrt::hw_context context(device, xclbin.get_uuid());
-
-  // get a kernel handle
-  if (verbosity >= 1)
-    std::cout << "Getting handle to kernel:" << kernelName << "\n";
-  auto kernel = xrt::kernel(context, kernelName);
-
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(std::bfloat16_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-
-  if (verbosity >= 1)
-    std::cout << "Writing data into buffer objects.\n";
-
-  std::bfloat16_t *bufA = bo_inA.map<std::bfloat16_t *>();
-  std::vector<std::bfloat16_t> AVec(IN_SIZE);
-  for (int i = 0; i < IN_SIZE; i++)
-    AVec[i] = random_bfloat16_t() / 8.0;
-  memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t)));
-
-  void *bufInstr = bo_instr.map<void *>();
-  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  int sticky_errors = 0;
-
-  unsigned num_iter = 64;
-  float npu_time_total = 0;
-  float npu_time_min = 9999999;
-  float npu_time_max = 0;
-
-  // Lets also benchmark the CPU
-  float cpu_time_total = 0;
-  float cpu_time_min = 9999999;
-  float cpu_time_max = 0;
-
-  for (unsigned iter = 0; iter < num_iter; iter++) {
-
-    if (verbosity >= 1)
-      std::cout << "Running Kernel.\n";
-
-    auto start = std::chrono::high_resolution_clock::now();
-
-    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out);
-    run.wait();
-    auto stop = std::chrono::high_resolution_clock::now();
-
-    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-    std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
-
-    int errors = 0;
-
-    if (VERIFY) {
-      if (verbosity >= 1) {
-        std::cout << "Verifying results ..." << std::endl;
-      }
-
-      std::vector<std::bfloat16_t> RefVec(IN_SIZE);
-      auto cpu_start = std::chrono::high_resolution_clock::now();
-
-      for (uint32_t t = 0; t < IN_SIZE; t += TILE_SIZE) {
-        float running = 0.0;
-        for (uint32_t i = 0; i < TILE_SIZE; i++) {
-          float ez = (float)(exp(AVec[t + i]));
-          running += ez;
-          RefVec[t + i] = exp(AVec[t + i]);
-        }
-
-        for (uint32_t i = 0; i < TILE_SIZE; i++) {
-          RefVec[t + i] /= running;
-        }
-      }
-      auto cpu_stop = std::chrono::high_resolution_clock::now();
-      float cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(
-                           cpu_stop - cpu_start)
-                           .count();
-
-      cpu_time_total += cpu_time;
-      cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min;
-      cpu_time_max = (cpu_time > cpu_time_max) ? cpu_time : cpu_time_max;
-
-      for (uint32_t i = 0; i < IN_SIZE; i++) {
-        std::bfloat16_t ref = RefVec[i];
-        if (!nearly_equal(*(bufOut + i), ref)) {
-          std::cout << "Error in " << i << " output " << *(bufOut + i)
-                    << " != " << ref << " actual e^" << AVec[i] << " : "
-                    << exp(AVec[i]) << std::endl;
-          errors++;
-          sticky_errors++;
-        } else {
-          if (verbosity >= 2)
-            std::cout << "Correct " << i << " output " << *(bufOut + i)
-                      << " == " << ref << std::endl;
-        }
-      }
-
-    } else {
-      if (verbosity >= 1)
-        std::cout << "WARNING: vector-scalar results not verified."
-                  << std::endl;
-    }
-
-    float npu_time =
-        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
-            .count();
-
-    npu_time_total += npu_time;
-    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
-    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
-
-    std::string profile = vm["profile"].as<std::string>();
-    if (profile.length()) {
-      std::ofstream of;
-      of.open(profile, std::ios::app); // Append
-      of << IN_SIZE << "," << TILE_SIZE << "," << npu_time << std::endl;
-    }
-
-    if (VERIFY) {
-      if (!errors) {
-        std::cout << iter << ": pass! in " << npu_time << "us" << std::endl;
-      } else {
-        std::cout << iter << ": fail! " << errors << " errors in " << npu_time
-                  << "us" << std::endl;
-      }
-    }
-  }
-
-  std::cout << "Avg NPU exec time: " << npu_time_total / num_iter << "us."
-            << std::endl;
-  std::cout << "Min NPU exec time: " << npu_time_min << "us." << std::endl;
-  std::cout << "Max NPU exec time: " << npu_time_max << "us." << std::endl;
-
-  // Let's figure out how many cycles it takes a core to do a single e^x
-  // There are 4 cores, so the total number of e^x's it does is one quarter of
-  // the test size
-
-  int per_core_calcs = IN_SIZE / 4;
-  float avg_npu_time = npu_time_total / num_iter;
-  float avg_npu_clocks =
-      avg_npu_time / 1.0E-3; // Time is in uS, but the AIE is clocked in nS
-  float clocks_per_calc = avg_npu_clocks / per_core_calcs;
-  std::cout << "Clocks per calc " << clocks_per_calc << std::endl;
-
-  std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us."
-            << std::endl;
-  std::cout << "Min CPU exec time: " << cpu_time_min << "us." << std::endl;
-  std::cout << "Max CPU exec time: " << cpu_time_max << "us." << std::endl;
-
-  if (VERIFY) {
-    if (!sticky_errors) {
-      std::cout << std::endl << "PASS!" << std::endl << std::endl;
-      return 0;
-    } else {
-      std::cout << std::endl << "FAIL." << std::endl << std::endl;
-      return 1;
-    }
-  } else {
-    std::cout << "Verification skipped, but I'm sure it worked.  I trust in you"
-              << std::endl;
-  }
-  return 0;
-}
diff --git a/programming_examples/basic/vector_softmax/test.cpp.orig b/programming_examples/basic/vector_softmax/test.cpp.orig
deleted file mode 100644
index 2fa314e946..0000000000
--- a/programming_examples/basic/vector_softmax/test.cpp.orig
+++ /dev/null
@@ -1,325 +0,0 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#include <bits/stdc++.h>
-#include <boost/program_options.hpp>
-#include <chrono>
-#include <cstdint>
-#include <cstdlib>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <stdfloat>
-#include <string>
-#include <vector>
-#include <math.h>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-constexpr bool VERIFY = true;
-
-constexpr int IN_SIZE = 65536; //*1024;
-constexpr int TILE_SIZE = 1024;
-constexpr int OUT_SIZE = IN_SIZE;
-
-namespace po = boost::program_options;
-
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
-  if (!vm_in.count(name)) {
-    throw std::runtime_error("Error: no " + name + " file was provided\n");
-  } else {
-    std::ifstream test(vm_in[name].as<std::string>());
-    if (!test) {
-      throw std::runtime_error("The " + name + " file " +
-                               vm_in[name].as<std::string>() +
-                               " does not exist.\n");
-    }
-  }
-}
-
-static inline std::bfloat16_t random_bfloat16_t() {
-  // Random numbers should NOT be uniformly between 0 and 1, because that
-  // would make the matrix product AB always close to 1.
-  return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX));
-}
-
-bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) {
-  std::bfloat16_t diff = fabs(a - b);
-  if ((diff / a) < 0.1)
-    return true;
-  else
-    return false;
-}
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
-int main(int argc, const char *argv[]) {
-
-  // Program arguments parsing
-  po::options_description desc("Allowed options");
-
-  desc.add_options()("help,h", "produce help message")(
-      "xclbin,x", po::value<std::string>()->required(),
-      "the input xclbin path")(
-      "kernel,k", po::value<std::string>()->required(),
-      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "verbosity,v", po::value<int>()->default_value(0),
-      "the verbosity of the output")(
-      "profile,p", po::value<std::string>()->default_value(""),"CSV profile")(
-      "instr,i", po::value<std::string>()->required(),
-      "path of file containing userspace instructions to be sent to the LX6");
-  po::variables_map vm;
-
-  try {
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
-
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
-
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
-
-  std::vector<uint32_t> instr_v =
-      load_instr_sequence(vm["instr"].as<std::string>());
-
-  int verbosity = vm["verbosity"].as<int>();
-  if (verbosity >= 1)
-    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
-
-  // Start the XRT test code
-  // Get a device handle
-  unsigned int device_index = 0;
-  auto device = xrt::device(device_index);
-
-  // Load the xclbin
-  if (verbosity >= 1)
-    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
-  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
-
-  if (verbosity >= 1)
-    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
-  std::string Node = vm["kernel"].as<std::string>();
-
-  // Get the kernel from the xclbin
-  auto xkernels = xclbin.get_kernels();
-  auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
-                               [Node](xrt::xclbin::kernel &k) {
-                                 auto name = k.get_name();
-                                 std::cout << "Name: " << name << std::endl;
-                                 return name.rfind(Node, 0) == 0;
-                               });
-  auto kernelName = xkernel.get_name();
-
-  if (verbosity >= 1)
-    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
-              << "\n";
-
-  device.register_xclbin(xclbin);
-
-  // get a hardware context
-  if (verbosity >= 1)
-    std::cout << "Getting hardware context.\n";
-  xrt::hw_context context(device, xclbin.get_uuid());
-
-  // get a kernel handle
-  if (verbosity >= 1)
-    std::cout << "Getting handle to kernel:" << kernelName << "\n";
-  auto kernel = xrt::kernel(context, kernelName);
-
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(std::bfloat16_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-
-  if (verbosity >= 1)
-    std::cout << "Writing data into buffer objects.\n";
-
-  std::bfloat16_t *bufA = bo_inA.map<std::bfloat16_t *>();
-  std::vector<std::bfloat16_t> AVec(IN_SIZE);
-  for (int i = 0; i < IN_SIZE; i++)
-    AVec[i] = random_bfloat16_t()/8.0;
-  memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t)));
-
-  void *bufInstr = bo_instr.map<void *>();
-  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  int sticky_errors = 0;
-
-  unsigned num_iter = 64;
-  float npu_time_total = 0;
-  float npu_time_min = 9999999;
-  float npu_time_max = 0;
-
-  // Lets also benchmark the CPU
-  float cpu_time_total = 0;
-  float cpu_time_min = 9999999;
-  float cpu_time_max = 0;
-
-
-  for (unsigned iter = 0; iter < num_iter; iter++) {
-
-    if (verbosity >= 1)
-      std::cout << "Running Kernel.\n";
-
-    auto start = std::chrono::high_resolution_clock::now();
-
-    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out);
-    run.wait();
-    auto stop = std::chrono::high_resolution_clock::now();
-
-    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-    std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
-
-    int errors = 0;
-
-    if (VERIFY) {
-      if (verbosity >= 1) {
-        std::cout << "Verifying results ..." << std::endl;
-      }
-
-      std::vector<std::bfloat16_t> RefVec(IN_SIZE);
-      auto cpu_start = std::chrono::high_resolution_clock::now();
-
-      for (uint32_t t = 0; t < IN_SIZE; t+=TILE_SIZE) {
-        float running = 0.0;
-        for (uint32_t i = 0; i < TILE_SIZE; i++) {
-          float ez = (float)(exp(AVec[t+i]));
-          running += ez;
-          RefVec[t+i] = exp(AVec[t+i]);
-        }
-        
-        for (uint32_t i = 0; i < TILE_SIZE; i++) {
-          RefVec[t+i] /= running;
-        }
-      }      
-      auto cpu_stop = std::chrono::high_resolution_clock::now();
-      float cpu_time =
-          std::chrono::duration_cast<std::chrono::microseconds>(cpu_stop - cpu_start)
-              .count();
-
-      cpu_time_total += cpu_time;
-      cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min;
-      cpu_time_max = (cpu_time > cpu_time_max) ? cpu_time : cpu_time_max;
-
-      
-
-      for (uint32_t i = 0; i < IN_SIZE; i++) {
-        std::bfloat16_t ref = RefVec[i];
-        if (!nearly_equal(*(bufOut + i), ref)) {
-          std::cout << "Error in " << i << " output " << *(bufOut + i)
-                    << " != " << ref << " actual e^" << AVec[i] << " : " << exp(AVec[i]) << std::endl;
-          errors++;
-          sticky_errors++;
-        } else {
-          if (verbosity >= 2)
-            std::cout << "Correct " << i << " output " << *(bufOut + i)
-                      << " == " << ref << std::endl;
-        }
-      }
-      
-
-
-    } else {
-      if (verbosity >= 1)
-        std::cout << "WARNING: vector-scalar results not verified."
-                  << std::endl;
-    }
-
-    float npu_time =
-        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
-            .count();
-
-    npu_time_total += npu_time;
-    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
-    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
-
-    std::string profile = vm["profile"].as<std::string>();
-    if (profile.length()) {
-      std::ofstream of;
-      of.open(profile, std::ios::app); // Append
-      of << IN_SIZE << "," << TILE_SIZE << "," << npu_time << std::endl;
-    }
-
-    if (VERIFY) {
-      if (!errors) {
-        std::cout << iter << ": pass! in " << npu_time << "us" << std::endl;
-      } else {
-        std::cout << iter << ": fail! " << errors << " errors in " << npu_time << "us" << std::endl;
-      }
-    }
-  }
-
-  std::cout << "Avg NPU exec time: " << npu_time_total / num_iter << "us."
-            << std::endl;
-  std::cout << "Min NPU exec time: " << npu_time_min << "us." << std::endl;
-  std::cout << "Max NPU exec time: " << npu_time_max << "us." << std::endl;
-
-  // Let's figure out how many cycles it takes a core to do a single e^x
-  // There are 4 cores, so the total number of e^x's it does is one quarter of the test size
-
-  int per_core_calcs = IN_SIZE/4;
-  float avg_npu_time = npu_time_total / num_iter;
-  float avg_npu_clocks = avg_npu_time/1.0E-3;  // Time is in uS, but the AIE is clocked in nS
-  float clocks_per_calc = avg_npu_clocks/per_core_calcs;
-  std::cout << "Clocks per calc " << clocks_per_calc << std::endl;
-
-
-
-  
-  std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us."
-            << std::endl;
-  std::cout << "Min CPU exec time: " << cpu_time_min << "us." << std::endl;
-  std::cout << "Max CPU exec time: " << cpu_time_max << "us." << std::endl;
-
-
-  if (VERIFY) {
-    if (!sticky_errors) {
-      std::cout << std::endl << "PASS!" << std::endl << std::endl;
-      return 0;
-    } else {
-      std::cout << std::endl << "FAIL." << std::endl << std::endl;
-      return 1;
-    }
-  }
-  else {
-    std::cout << "Verification skipped, but I'm sure it worked.  I trust in you" << std::endl;
-  }
-  return 0;
-}
diff --git a/programming_examples/basic/relu/CMakeLists.txt b/programming_examples/ml/relu/CMakeLists.txt
similarity index 100%
rename from programming_examples/basic/relu/CMakeLists.txt
rename to programming_examples/ml/relu/CMakeLists.txt
diff --git a/programming_examples/basic/relu/Makefile b/programming_examples/ml/relu/Makefile
similarity index 56%
rename from programming_examples/basic/relu/Makefile
rename to programming_examples/ml/relu/Makefile
index 87e836fbfb..2869ca2976 100644
--- a/programming_examples/basic/relu/Makefile
+++ b/programming_examples/ml/relu/Makefile
@@ -8,22 +8,22 @@
 
 include ../../makefile-common
 
-all: build/final.xclbin build/insts.txt
+all: build/final.xclbin
 
-targetname = testRelu
+targetname = myReLU
 
-build/bf16_relu.o: ../../../aie_kernels/relu.cc
+build/relu.o: ../../../aie_kernels/aie2/relu.cc
 	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c ../$< -o ${@F}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../$< -o ${@F}
 
 build/aie.mlir: aie2.py
 	mkdir -p ${@D}
 	python3 $< > $@
 
-build/final.xclbin: build/aie.mlir build/bf16_relu.o
+build/final.xclbin: build/aie.mlir build/relu.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
@@ -39,15 +39,8 @@ endif
 run: ${targetname}.exe build/final.xclbin build/insts.txt 
 	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
 
-run_g: ${targetname}.exe build/final.xclbin build/insts.txt 
-	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --trace_sz 65536
+run_py: build/final.xclbin build/insts.txt
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
 
-trace:
-	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
-
-clean_trace:
-	rm -rf tmpTrace trace.txt
-
-clean: clean_trace
+clean:
 	rm -rf build _build ${targetname}.exe
-
diff --git a/programming_examples/basic/relu/aie2.py b/programming_examples/ml/relu/aie2.py
similarity index 97%
rename from programming_examples/basic/relu/aie2.py
rename to programming_examples/ml/relu/aie2.py
index 8204706127..6f3fe40ee0 100644
--- a/programming_examples/basic/relu/aie2.py
+++ b/programming_examples/ml/relu/aie2.py
@@ -49,7 +49,7 @@ def device_body():
 
             # AIE Core Function declarations
 
-            bf16_relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty])
+            relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty])
 
             # Tile declarations
             ShimTile = tile(0, 0)
@@ -87,7 +87,7 @@ def device_body():
             # Set up compute tiles
             for i in range(n_cores):
                 # Compute tile i
-                @core(cores[i], "bf16_relu.o")
+                @core(cores[i], "relu.o")
                 def core_body():
                     for _ in for_(0xFFFFFFFF):
                         for _ in for_(tiles):
@@ -98,7 +98,7 @@ def core_body():
                                 ObjectFifoPort.Consume, 1
                             )
 
-                            call(bf16_relu, [elem_in_a, elem_out])
+                            call(relu, [elem_in_a, elem_out])
 
                             inA_fifos[inA_fifo_names[i]].release(
                                 ObjectFifoPort.Consume, 1
diff --git a/programming_examples/ml/relu/run.lit b/programming_examples/ml/relu/run.lit
new file mode 100644
index 0000000000..16c48f2aeb
--- /dev/null
+++ b/programming_examples/ml/relu/run.lit
@@ -0,0 +1,11 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/relu.cc -o relu.o
+// RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_examples/ml/relu/test.cpp b/programming_examples/ml/relu/test.cpp
new file mode 100644
index 0000000000..170d90d9fd
--- /dev/null
+++ b/programming_examples/ml/relu/test.cpp
@@ -0,0 +1,246 @@
+//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <bits/stdc++.h>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdfloat>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+// ------------------------------------------------------
+// Configure this to match your buffer data type
+// ------------------------------------------------------
+using INOUT0_DATATYPE = std::bfloat16_t;
+using INOUT1_DATATYPE = std::bfloat16_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Verify results (specific to our design example): counts the elements of B
+// that are not nearly equal to max(A[i], 0). NaN inputs are skipped.
+// ----------------------------------------------------------------------------
+template <typename T>
+int verify(int size, const std::vector<T> &A, const std::vector<T> &B,
+           int verbosity) {
+  // Vectors are taken by const reference so each call does not copy them
+  int errors = 0;
+  for (int i = 0; i < size; i++) {
+    // If the input is nan, lets just say its good
+    if (isnan(A[i]))
+      continue;
+
+    T ref = (A[i] > (T)0) ? A[i] : (T)0;
+    if (!test_utils::nearly_equal(ref, B[i])) {
+      std::cout << "Error in output " << B[i] << " != " << ref << " from "
+                << A[i] << std::endl;
+      errors++;
+    } else if (verbosity > 1) {
+      std::cout << "Correct output " << B[i] << " == " << ref << std::endl;
+    }
+  }
+  return errors;
+}
+
+int main(int argc, const char *argv[]) {
+  // Host driver for the ReLU design: runs warmup + timed iterations of the
+  // kernel and, when requested, checks each timed output on the CPU.
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  test_utils::add_default_options(desc);
+
+  test_utils::parse_options(argc, argv, desc, vm);
+
+  int verbosity = vm["verbosity"].as<int>();
+  bool do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
+
+  int INOUT0_VOLUME = 65536;         // Input
+  int INOUT1_VOLUME = INOUT0_VOLUME; // Output
+
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+
+  size_t OUT_SIZE = INOUT1_SIZE + trace_size;
+
+  srand(time(NULL));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
+  xrt::device device;
+  xrt::kernel kernel;
+
+  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+                                   vm["xclbin"].as<std::string>(),
+                                   vm["kernel"].as<std::string>());
+
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  // Assumes trace will only be added to inout1
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Initialize instruction buffer
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  // Initialize Inout buffer 0 with ascending bfloat16 raw patterns
+  // All of them ...
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
+  for (int i = 0; i < INOUT0_VOLUME; i++) {
+    uint16_t raw = (uint16_t)i;
+    // bit_cast reinterprets the 16 raw bits without an aliasing pointer cast
+    AVec[i] = std::bit_cast<std::bfloat16_t>(raw);
+  }
+  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
+
+  // Initialize Inout buffer 1 with zeros
+  char *bufInOut1 = bo_inout1.map<char *>();
+  memset(bufInOut1, 0, OUT_SIZE); // Zeroes out INOUT1_VOLUME + trace_size
+
+  // Sync buffers to update input buffer values
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  int num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0; // Summed over every verified iteration
+
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (int iter = 0; iter < num_iter; iter++) {
+
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    if (iter < n_warmup_iterations) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Copy output results and verify they are correct
+    std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
+
+    memcpy(BVec.data(), bufInOut1, (BVec.size() * sizeof(INOUT1_DATATYPE)));
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      // Accumulate so a failure in any iteration still fails the run
+      errors += verify(INOUT0_VOLUME, AVec, BVec, verbosity);
+      auto vstop = std::chrono::system_clock::now();
+      // Fractional seconds; an integral duration_cast would truncate to 0
+      float vtime = std::chrono::duration<float>(vstop - vstart).count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
+    }
+
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  // Average is taken over the timed (non-warmup) iterations only
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/programming_examples/basic/vector_softmax/CMakeLists.txt b/programming_examples/ml/vector_softmax/CMakeLists.txt
similarity index 100%
rename from programming_examples/basic/vector_softmax/CMakeLists.txt
rename to programming_examples/ml/vector_softmax/CMakeLists.txt
diff --git a/programming_examples/basic/vector_softmax/Makefile b/programming_examples/ml/vector_softmax/Makefile
similarity index 89%
rename from programming_examples/basic/vector_softmax/Makefile
rename to programming_examples/ml/vector_softmax/Makefile
index eea6b707f5..4f27c07551 100755
--- a/programming_examples/basic/vector_softmax/Makefile
+++ b/programming_examples/ml/vector_softmax/Makefile
@@ -12,22 +12,22 @@ targetname = testExp
 
 all: build/final.xclbin build/insts.txt
 
-build/vecexp.cc: bf16_softmax.mlir
+build/dut.cc: bf16_softmax.mlir
 	mkdir -p ${@D}
-	cd ${@D} &&	aie-opt ../$< -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o vecexp.cc
+	cd ${@D} &&	aie-opt ../$< -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o ${@F}
 
-build/vecexp.o: build/vecexp.cc
+build/dut.o: build/dut.cc
 	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
 
 build/lut_based_ops.o:
 	mkdir -p ${@D}
 	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F}
 
-build/exp.o: exp.cc
+build/softmax.o: softmax.cc
 	mkdir -p ${@D}
 	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
 
-build/kernels.a: build/exp.o build/lut_based_ops.o build/vecexp.o
+build/kernels.a: build/softmax.o build/lut_based_ops.o build/dut.o
 	ar rvs $@ $+
 
 build/aie.mlir: aie2.py
@@ -63,5 +63,5 @@ clean_trace:
 	rm -rf tmpTrace trace.txt
 
 clean: clean_trace
-	rm -rf build _build ${targetname}.exe vecexp.cc
+	rm -rf build _build ${targetname}.exe 
 
diff --git a/programming_examples/basic/vector_softmax/README.md b/programming_examples/ml/vector_softmax/README.md
similarity index 100%
rename from programming_examples/basic/vector_softmax/README.md
rename to programming_examples/ml/vector_softmax/README.md
diff --git a/programming_examples/basic/vector_softmax/aie2.py b/programming_examples/ml/vector_softmax/aie2.py
similarity index 95%
rename from programming_examples/basic/vector_softmax/aie2.py
rename to programming_examples/ml/vector_softmax/aie2.py
index 05d4ed5be9..5672819f7a 100755
--- a/programming_examples/basic/vector_softmax/aie2.py
+++ b/programming_examples/ml/vector_softmax/aie2.py
@@ -46,8 +46,8 @@ def device_body():
 
             # AIE Core Function declarations
 
-            exp_bf16_vector = external_func(
-                "exp_bf16_vector", inputs=[memRef_ty, memRef_ty]
+            softmax_bf16_vector = external_func(
+                "softmax_bf16_vector", inputs=[memRef_ty, memRef_ty]
             )
 
             # Tile declarations
@@ -93,7 +93,7 @@ def core_body():
                                 ObjectFifoPort.Consume, 1
                             )
 
-                            call(exp_bf16_vector, [elem_in_a, elem_out])
+                            call(softmax_bf16_vector, [elem_in_a, elem_out])
 
                             inA_fifos[inA_fifo_names[i]].release(
                                 ObjectFifoPort.Consume, 1
diff --git a/programming_examples/basic/vector_softmax/bf16_softmax.mlir b/programming_examples/ml/vector_softmax/bf16_softmax.mlir
similarity index 100%
rename from programming_examples/basic/vector_softmax/bf16_softmax.mlir
rename to programming_examples/ml/vector_softmax/bf16_softmax.mlir
diff --git a/programming_examples/ml/vector_softmax/run.lit b/programming_examples/ml/vector_softmax/run.lit
new file mode 100644
index 0000000000..54c7ccff98
--- /dev/null
+++ b/programming_examples/ml/vector_softmax/run.lit
@@ -0,0 +1,15 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: aie-opt %S/bf16_softmax.mlir --affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -I%S/../../../aie_runtime_lib/AIE2 -c dut.cc -o dut.o
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o lut_based_ops.o
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/softmax.cc -o softmax.o
+// RUN: ar rvs kernels.a dut.o lut_based_ops.o softmax.o
+// RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_examples/basic/vector_softmax/exp.cc b/programming_examples/ml/vector_softmax/softmax.cc
similarity index 84%
rename from programming_examples/basic/vector_softmax/exp.cc
rename to programming_examples/ml/vector_softmax/softmax.cc
index 5fd060c239..6c4f9e27e1 100755
--- a/programming_examples/basic/vector_softmax/exp.cc
+++ b/programming_examples/ml/vector_softmax/softmax.cc
@@ -19,10 +19,11 @@
 
 #include <aie_api/aie.hpp>
 
+// Softmax DUT generated from vector dialect
 extern void dut(bfloat16 *a_in, bfloat16 *cout);
 
 extern "C" {
 
-void exp_bf16_vector(bfloat16 *a_in, bfloat16 *c_out) { dut(a_in, c_out); }
+void softmax_bf16_vector(bfloat16 *a_in, bfloat16 *c_out) { dut(a_in, c_out); }
 
 } // extern "C"
diff --git a/programming_examples/ml/vector_softmax/test.cpp b/programming_examples/ml/vector_softmax/test.cpp
new file mode 100644
index 0000000000..94767b139e
--- /dev/null
+++ b/programming_examples/ml/vector_softmax/test.cpp
@@ -0,0 +1,256 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <bits/stdc++.h>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdfloat>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+using INOUT0_DATATYPE = std::bfloat16_t;
+using INOUT1_DATATYPE = std::bfloat16_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Verify results (specific to our design example)
+// Host softmax per tile_size-long tile of A, compared element-wise to B.
+// Returns the number of elements failing test_utils::nearly_equal.
+// ----------------------------------------------------------------------------
+template <typename T>
+int verify(int size, int tile_size, const std::vector<T> &A,
+           const std::vector<T> &B, int verbosity) {
+  int errors = 0;
+  std::vector<T> RefVec(size);
+
+  for (int t = 0; t < size; t += tile_size) {
+    // Clamp the last tile: size need not be a multiple of tile_size.
+    const int tile_end = std::min(size, t + tile_size);
+    float running = 0.0;
+    for (int i = t; i < tile_end; i++) {
+      const auto ez = exp(A[i]); // evaluate exp once per element
+      running += (float)ez;
+      RefVec[i] = static_cast<T>(ez);
+    }
+    // Normalise the tile by its own sum of exponentials.
+    for (int i = t; i < tile_end; i++)
+      RefVec[i] /= running;
+  }
+
+  for (int i = 0; i < size; i++) {
+    if (!test_utils::nearly_equal(RefVec[i], B[i], 0.03125)) {
+      std::cout << "Error in output " << B[i] << " != " << RefVec[i]
+                << std::endl;
+      errors++;
+    } else if (verbosity > 1) {
+      std::cout << "Correct output " << B[i] << " == " << RefVec[i]
+                << std::endl;
+    }
+  }
+  return errors;
+}
+
+// Host driver: loads the xclbin and instruction sequence, runs the softmax
+// design for warmup + timed iterations, optionally verifies the output
+// against a host reference, and prints timing. Returns 0 on PASS, 1 otherwise.
+int main(int argc, const char *argv[]) {
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  test_utils::add_default_options(desc);
+
+  test_utils::parse_options(argc, argv, desc, vm);
+
+  int verbosity = vm["verbosity"].as<int>();
+  bool do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
+
+  int TILE_SIZE = 1024;
+  int INOUT0_VOLUME = 262144;        // Input
+  int INOUT1_VOLUME = INOUT0_VOLUME; // Output
+
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+
+  size_t OUT_SIZE = INOUT1_SIZE + trace_size;
+
+  srand(time(nullptr));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
+  xrt::device device;
+  xrt::kernel kernel;
+
+  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+                                   vm["xclbin"].as<std::string>(),
+                                   vm["kernel"].as<std::string>());
+
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  // Assumes trace will only be added to inout1
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Initialize instruction buffer
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  // Initialize Inout buffer 0 with values from
+  // test_utils::random_bfloat16_t; AVec keeps a host copy for verification
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
+  for (int i = 0; i < INOUT0_VOLUME; i++) {
+    AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)8.0,
+                                            (std::bfloat16_t)-4.0);
+  }
+  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
+
+  // Initialize Inout buffer 1 with zeros
+  char *bufInOut1 = bo_inout1.map<char *>();
+  memset(bufInOut1, 0, OUT_SIZE); // Zeroes out INOUT1_VOLUME + trace_size
+
+  // Sync buffers to update input buffer values
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0; // mismatches summed over all verified iterations
+
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    if (verbosity >= 1) {
+      std::cout << "Running Kernel.\n";
+    }
+
+    // Run kernel. Only the launch and the wait for completion are timed;
+    // the output buffer is synced back to the host after the stop timestamp,
+    // so that transfer is not part of the reported NPU time.
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    if (iter < static_cast<unsigned>(n_warmup_iterations)) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Copy output results and verify they are correct
+    std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
+
+    memcpy(BVec.data(), bufInOut1, (BVec.size() * sizeof(INOUT1_DATATYPE)));
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      // Accumulate: a mismatch in any timed iteration must fail the run.
+      errors += verify(INOUT0_VOLUME, TILE_SIZE, AVec, BVec, verbosity);
+      auto vstop = std::chrono::system_clock::now();
+      // duration<float> keeps fractional seconds instead of truncating.
+      float vtime = std::chrono::duration<float>(vstop - vstart).count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else if (verbosity >= 1) {
+      std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
+    }
+
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  }
+  std::cout << "\nError count: " << errors << "\n\n";
+  std::cout << "\nFailed.\n\n";
+  return 1;
+}