diff --git a/programming_examples/basic/eltwise_add/add.cc b/aie_kernels/aie2/add.cc
similarity index 100%
rename from programming_examples/basic/eltwise_add/add.cc
rename to aie_kernels/aie2/add.cc
diff --git a/aie_kernels/aie2/bf16_exp.cc b/aie_kernels/aie2/bf16_exp.cc
new file mode 100644
index 0000000000..304ce1f1dc
--- /dev/null
+++ b/aie_kernels/aie2/bf16_exp.cc
@@ -0,0 +1,23 @@
+#include <lut_based_ops.h>
+
+// Writes getExpBf16(in[i]) to out[i] for N bfloat16 values, 16 lanes per step.
+// NOTE(review): chess_loop_range(64, 64) matches N == 1024 only - confirm.
+template <const int N>
+void exp_bf16_func(bfloat16 *restrict in, bfloat16 *restrict out) {
+  int lanes = 16;
+  for (int offset = 0; offset < N; offset += lanes)
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16bfloat16 x = *(v16bfloat16 *)(in + offset);
+      // getExpBf16 yields a float accumulator; narrow it back to bfloat16.
+      v16accfloat exp_acc = getExpBf16(x);
+      *(v16bfloat16 *)(out + offset) = to_v16bfloat16(exp_acc);
+    }
+}
+
+extern "C" {
+// C-linkage entry point: runs exp_bf16_func over a fixed 1024-element tile.
+void exp_bf16_1024(bfloat16 *a_in, bfloat16 *c_out) {
+  exp_bf16_func<1024>(a_in, c_out);
+}
+
+} // extern "C"
diff --git a/aie_kernels/relu.cc b/aie_kernels/relu.cc
new file mode 100644
index 0000000000..a2e87cffc4
--- /dev/null
+++ b/aie_kernels/relu.cc
@@ -0,0 +1,41 @@
+//===- relu.cc --------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#include <aie_api/aie.hpp>
+
+// ReLU: c[i] = max(a[i], 0) over TILE_SIZE bfloat16 values, 32 lanes per step.
+// NOTE(review): chess_loop_range(32, 32) matches TILE_SIZE == 1024 - confirm.
+void relu(bfloat16 *restrict a, bfloat16 *restrict c, const int TILE_SIZE) {
+  const int lanes = 32;
+  v32bfloat16 zero_vec = broadcast_zero_bfloat16();
+  event0();
+  for (size_t pos = 0; pos < TILE_SIZE; pos += lanes)
+    chess_prepare_for_pipelining chess_loop_range(32, 32) {
+      v32bfloat16 in_vec = *(v32bfloat16 *)(a + pos);
+      // Lanes below zero are replaced by zero; the rest pass through.
+      *(v32bfloat16 *)(c + pos) = max(in_vec, zero_vec);
+    }
+  event1();
+}
+
+extern "C" {
+// C-linkage entry point: applies relu to a fixed 1024-element bfloat16 tile.
+void bf16_relu(bfloat16 *a_in, bfloat16 *c_out) { relu(a_in, c_out, 1024); }
+
+} // extern "C"
diff --git a/programming_examples/basic/eltwise_exp/CMakeLists.txt b/programming_examples/basic/eltwise_exp/CMakeLists.txt
new file mode 100644
index 0000000000..c64f84842b
--- /dev/null
+++ b/programming_examples/basic/eltwise_exp/CMakeLists.txt
@@ -0,0 +1,69 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# target_link_directories() used below was added in CMake 3.13
+cmake_minimum_required(VERSION 3.13)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC 
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+    ../../../programming_examples/utils
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/eltwise_exp/Makefile b/programming_examples/basic/eltwise_exp/Makefile
new file mode 100644
index 0000000000..e34ff2dff8
--- /dev/null
+++ b/programming_examples/basic/eltwise_exp/Makefile
@@ -0,0 +1,55 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../../programming_examples/basic/makefile-common
+
+all: build/final.xclbin
+
+targetname = eltwise_exp
+
+build/lut_based_ops.o:
+	mkdir -p ${@D}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F}
+
+build/exp.o: 
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c ${REPO_ROOT}/aie_kernels/aie2/bf16_exp.cc -o ${@F}
+
+build/kernels.a: build/exp.o build/lut_based_ops.o
+	ar rvs $@ $+
+
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+# aie2.py links every core against kernels.a, so build it before aiecc.py runs
+build/final.xclbin: build/aie.mlir build/kernels.a
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+#	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+run_py: build/final.xclbin build/insts.txt
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+	rm -rf build _build ${targetname}.exe
diff --git a/programming_examples/basic/eltwise_exp/README.md b/programming_examples/basic/eltwise_exp/README.md
new file mode 100644
index 0000000000..3e34f60ba0
--- /dev/null
+++ b/programming_examples/basic/eltwise_exp/README.md
@@ -0,0 +1,13 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+# <ins>Eltwise Exp</ins>
+
+A simple element-wise exponential function, using the look-up table capabilities of the AI Engine.
\ No newline at end of file
diff --git a/programming_examples/basic/eltwise_exp/aie2.py b/programming_examples/basic/eltwise_exp/aie2.py
new file mode 100644
index 0000000000..40a4e87710
--- /dev/null
+++ b/programming_examples/basic/eltwise_exp/aie2.py
@@ -0,0 +1,129 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+from aie.dialects.aie import *  # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx  # mlir ctx wrapper
+
+from aie.dialects.aiex import *  # extended mlir-aie dialect definitions
+from aie.dialects.scf import *  # scf (structured control flow) dialect
+from aie.extras.dialects.ext import memref, arith  # memref and arithmetic dialects
+
+
+# AI Engine structural design function
+def my_eltwise_exp():
+    """Print MLIR for exp over 65536 bf16 values on 4 cores, 1024 per tile."""
+    word_size_in = 2
+    N = 65536
+    N_in_bytes = N * word_size_in
+
+    A_sz_in_i32s = N_in_bytes // 4
+    C_sz_in_i32s = N_in_bytes // 4
+
+    # Tile sizes
+    n = 1024
+    N_div_n = N // n
+
+    n_cores = 4
+    tiles = N_div_n // n_cores
+    buffer_depth = 2
+
+    # ctx wrapper - to convert python to mlir
+    with mlir_mod_ctx() as ctx:
+
+        # Device declaration - aie2 device IPU (aka Ryzen AI)
+        @device(AIEDevice.ipu)
+        def device_body():
+
+            memRef_ty = T.memref(n, T.bf16())
+
+            # Type used in the tile memory
+            memRef_A_ty = T.memref(n, T.bf16())
+            memRef_C_ty = T.memref(n, T.bf16())
+
+            # Type used in the memory tile which aggregates across the 4 cores
+            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
+            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())
+
+            # AIE Core Function declarations
+
+            exp_bf16_1024 = external_func(
+                "exp_bf16_1024", inputs=[memRef_ty, memRef_ty]
+            )
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+
+            MemTile = tile(0, 1)
+            cores = [tile(0, 2 + i) for i in range(n_cores)]
+
+            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
+            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]
+
+            inA_fifos = {}
+            outC_fifos = {}
+
+            # AIE-array data movement with object fifos
+            # Input A
+            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
+            for i in range(n_cores):
+                inA_fifos[inA_fifo_names[i]] = object_fifo(
+                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
+                )
+            object_fifo_link(inA, inA_fifo_names)
+
+            # Output C
+            for i in range(n_cores):
+                outC_fifos[outC_fifo_names[i]] = object_fifo(
+                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
+                )
+            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
+            object_fifo_link(outC_fifo_names[0:n_cores], outC)
+
+            # Compute tile bodies
+            for i in range(n_cores):
+                # Compute tile i
+                @core(cores[i], "kernels.a")
+                def core_body():
+                    for _ in for_(0xFFFFFFFF):
+                        for _ in for_(tiles):
+                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
+                                ObjectFifoPort.Consume, 1
+                            )
+
+                            call(exp_bf16_1024, [elem_in_a, elem_out])
+
+                            inA_fifos[inA_fifo_names[i]].release(
+                                ObjectFifoPort.Consume, 1
+                            )
+                            outC_fifos[outC_fifo_names[i]].release(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            yield_([])
+                        yield_([])
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                ipu_dma_memcpy_nd(
+                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    # Print the mlir conversion
+    print(ctx.module)
+
+
+# Call design function to generate mlir code to stdout
+my_eltwise_exp()
diff --git a/programming_examples/basic/eltwise_exp/test.cpp b/programming_examples/basic/eltwise_exp/test.cpp
new file mode 100644
index 0000000000..0dac5179d1
--- /dev/null
+++ b/programming_examples/basic/eltwise_exp/test.cpp
@@ -0,0 +1,271 @@
+//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <bits/stdc++.h>
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+using INOUT0_DATATYPE = std::bfloat16_t;
+using INOUT1_DATATYPE = std::bfloat16_t;
+#endif
+
+namespace po = boost::program_options;
+
+// Verify results: returns how many of the first CSize elements of C do not
+// match exp(A[i]) rounded to bfloat16; verbosity > 1 also prints the matches.
+// NOTE(review): default nearly_equal tolerance may be too tight for bf16.
+template <typename T>
+int verify(int CSize, const std::vector<T> &A, const std::vector<T> &C,
+           int verbosity) {
+  int errors = 0;
+  for (int i = 0; i < CSize; i++) {
+    const std::bfloat16_t ref = exp(A[i]);
+    if (!test_utils::nearly_equal(ref, C[i])) {
+      std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
+      errors++;
+    } else if (verbosity > 1) {
+      std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
+    }
+  }
+  return errors;
+}
+
+// ----------------------------------------------------------------------------
+// Main: parse options, load the xclbin and instructions, run, verify and time
+// ----------------------------------------------------------------------------
+int main(int argc, const char *argv[]) {
+
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  test_utils::add_default_options(desc);
+
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  int do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
+
+  int INOUT0_VOLUME = 65536; // Input only, 65536x bfloat16_t
+  int INOUT1_VOLUME = 65536; // Output only, 65536x bfloat16_t
+
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+
+  // TODO Remove trace for now?
+  size_t OUT_SIZE = INOUT1_SIZE + trace_size;
+
+  srand(time(NULL));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Load the kernel
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                 [Node, verbosity](xrt::xclbin::kernel &k) {
+                                   auto name = k.get_name();
+                                   if (verbosity >= 1) {
+                                     std::cout << "Name: " << name << std::endl;
+                                   }
+                                   return name.rfind(Node, 0) == 0;
+                                 });
+  if (xkernel_it == xkernels.end()) {
+    std::cerr << "Kernel '" << Node << "' not found in xclbin.\n";
+    return 1;
+  }
+  auto kernelName = xkernel_it->get_name();
+
+  // Register xclbin
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+  device.register_xclbin(xclbin);
+
+  // Get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // Get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Initialize instruction buffer
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  // Initialize Inout buffer 0
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
+  for (int i = 0; i < INOUT0_VOLUME; i++)
+    AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)4.0,
+                                            (std::bfloat16_t)0.0);
+
+  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
+
+  // Sync buffers to update input buffer values
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0;
+
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    if (iter < n_warmup_iterations) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+    std::bfloat16_t *bufOut = bo_inout1.map<std::bfloat16_t *>();
+
+    // Copy output results and verify they are correct
+    std::vector<INOUT1_DATATYPE> CVec(INOUT1_VOLUME);
+
+    memcpy(CVec.data(), bufOut, (CVec.size() * sizeof(INOUT1_DATATYPE)));
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      // Accumulate so a failure in any iteration fails the whole run.
+      errors += verify(INOUT1_VOLUME, AVec, CVec, verbosity);
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufOut) + INOUT1_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
+    }
+
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/programming_examples/basic/eltwise_exp/test.py b/programming_examples/basic/eltwise_exp/test.py
new file mode 100644
index 0000000000..cc132020d9
--- /dev/null
+++ b/programming_examples/basic/eltwise_exp/test.py
@@ -0,0 +1,157 @@
+# test.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+# import argparse
+import numpy as np
+import pyxrt as xrt
+import sys
+import time
+
+sys.path.append("../../programming_examples/utils")
+import test_utils
+
+# ------------------------------------------------------
+# Configure this to match your design's buffer size
+# ------------------------------------------------------
+INOUT0_VOLUME = 64  # Input only, 64x uint32_t in this example
+INOUT1_VOLUME = 64  # Not used in this example
+INOUT2_VOLUME = 64  # Output only, 64x uint32_t in this example
+
+INOUT0_DATATYPE = np.uint32
+INOUT1_DATATYPE = np.uint32
+INOUT2_DATATYPE = np.uint32
+
+INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
+INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
+INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
+
+
+def main(opts):
+    """Run the xclbin named in opts on device 0, verify output, print timings."""
+    # Load instruction sequence
+    with open(opts.instr, "r") as f:
+        instr_text = f.read().split("\n")
+        instr_text = [l for l in instr_text if l != ""]
+        instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32)
+
+    # ------------------------------------------------------
+    # Get device, load the xclbin & kernel and register them
+    # ------------------------------------------------------
+
+    # Get a device handle
+    device = xrt.device(0)
+
+    # Load the xclbin
+    xclbin = xrt.xclbin(opts.xclbin)
+
+    # Load the kernel
+    kernels = xclbin.get_kernels()
+    try:
+        xkernel = [k for k in kernels if opts.kernel in k.get_name()][0]
+    except IndexError:
+        print(f"Kernel '{opts.kernel}' not found in '{opts.xclbin}'")
+        exit(-1)
+
+    # Register xclbin
+    device.register_xclbin(xclbin)
+
+    # Get a hardware context
+    context = xrt.hw_context(device, xclbin.get_uuid())
+
+    # get a kernel handle
+    kernel = xrt.kernel(context, xkernel.get_name())
+
+    # ------------------------------------------------------
+    # Initialize input/ output buffer sizes and sync them
+    # ------------------------------------------------------
+    bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
+    bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2))
+    bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3))
+    bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4))
+    # NOTE(review): uint32 buffers and the "+1" check may not match aie2.py; confirm.
+    # Initialize instruction buffer
+    bo_instr.write(instr_v, 0)
+
+    # Initialize data buffers
+    inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE)
+    inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE)
+    inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE)
+    bo_inout0.write(inout0, 0)
+    bo_inout1.write(inout1, 0)
+    bo_inout2.write(inout2, 0)
+
+    # Sync buffers to update input buffer values
+    bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+    bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+    bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+    bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+
+    # ------------------------------------------------------
+    # Initialize run configs
+    # ------------------------------------------------------
+    num_iter = opts.iters + opts.warmup_iters
+    npu_time_total = 0
+    npu_time_min = 9999999
+    npu_time_max = 0
+    errors = 0
+
+    # ------------------------------------------------------
+    # Main run loop
+    # ------------------------------------------------------
+    for i in range(num_iter):
+        # Run kernel
+        if opts.verbosity >= 1:
+            print("Running Kernel.")
+        start = time.time_ns()
+        h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2)
+        h.wait()
+        stop = time.time_ns()
+        bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
+
+        # Warmup iterations do not count towards average runtime.
+        if i < opts.warmup_iters:
+            continue
+
+        # Copy output results and verify they are correct
+        out_size = INOUT2_SIZE + opts.trace_size
+        output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE)
+        if opts.verify:
+            if opts.verbosity >= 1:
+                print("Verifying results ...")
+            ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE)
+            e = np.equal(output_buffer, ref)
+            errors = errors + np.size(e) - np.count_nonzero(e)
+
+        # Write trace values if trace_size > 0
+        if opts.trace_size > 0:
+            print("Do something with trace!")
+
+        npu_time = stop - start
+        npu_time_total = npu_time_total + npu_time
+        npu_time_min = min(npu_time_min, npu_time)
+        npu_time_max = max(npu_time_max, npu_time)
+
+    # ------------------------------------------------------
+    # Print verification and timing results
+    # ------------------------------------------------------
+
+    # TODO - Mac count to guide gflops
+
+    # Times are in ns; only the total is averaged over iterations.
+    print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000)))
+    print("\nMin NPU time: {}us.".format(int(npu_time_min / 1000)))
+    print("\nMax NPU time: {}us.".format(int(npu_time_max / 1000)))
+    if not errors:
+        print("\nPASS!\n")
+        exit(0)
+    else:
+        print("\nError count: ", errors)
+        print("\nFailed.\n")
+        exit(-1)
+
+
+if __name__ == "__main__":
+    opts = test_utils.parse_args(sys.argv[1:])
+    main(opts)
diff --git a/programming_examples/basic/eltwise_mul/CMakeLists.txt b/programming_examples/basic/eltwise_mul/CMakeLists.txt
new file mode 100644
index 0000000000..c64f84842b
--- /dev/null
+++ b/programming_examples/basic/eltwise_mul/CMakeLists.txt
@@ -0,0 +1,69 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# target_link_directories() used below was added in CMake 3.13
+cmake_minimum_required(VERSION 3.13)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC 
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+    ../../../programming_examples/utils
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/eltwise_mul/Makefile b/programming_examples/basic/eltwise_mul/Makefile
new file mode 100644
index 0000000000..363f12c4e4
--- /dev/null
+++ b/programming_examples/basic/eltwise_mul/Makefile
@@ -0,0 +1,47 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../../programming_examples/basic/makefile-common
+
+all: build/final.xclbin
+
+targetname = myEltwiseMul
+
+build/mul.o:
+	mkdir -p ${@D}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/aie_kernels/aie2/mul.cc -o ${@F}
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+build/final.xclbin: build/aie.mlir build/mul.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+#	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+run_py: build/final.xclbin build/insts.txt
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+	rm -rf build _build ${targetname}.exe
diff --git a/programming_examples/basic/eltwise_mul/README.md b/programming_examples/basic/eltwise_mul/README.md
new file mode 100644
index 0000000000..10dfc0a916
--- /dev/null
+++ b/programming_examples/basic/eltwise_mul/README.md
@@ -0,0 +1,20 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+# <ins>Section 3 - My First Program</ins>
+
+In this section, we'll put together what you learned in [section-1](../section-1) for defining a basic structural design in Python and combine it with the data movement part from [section-2](../section-2) to build our first program. We will then run a simulation on this program as well as run this design on hardware (Ryzen AI).
+
+* Introduce example of first simple program (Bias Add)
+    * Walk through syntax of aie2.py, test.cpp, test_utils.h, maybe CMakeLists.txt and Makefile/ makefile-common as well
+    * need to remove trace parts from test.cpp for now and move it to Section-4
+
+* Illustrate how to run the built-in simulation of a single-core design
+* Illustrate how to run designs on Ryzen AI enabled hardware
diff --git a/programming_examples/basic/eltwise_mul/aie2.py b/programming_examples/basic/eltwise_mul/aie2.py
new file mode 100644
index 0000000000..c5f15a459d
--- /dev/null
+++ b/programming_examples/basic/eltwise_mul/aie2.py
@@ -0,0 +1,151 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+
+def my_eltwise_mul():
+    """Print the MLIR for a bf16 element-wise multiply split over n_cores cores."""
+    word_size_in = 2  # bytes per bf16 element
+    N = 65536
+    N_in_bytes = N * word_size_in
+
+    A_sz_in_i32s = N_in_bytes // 4
+    B_sz_in_i32s = N_in_bytes // 4
+    C_sz_in_i32s = N_in_bytes // 4
+
+    # Tile sizes
+    n = 1024
+    N_div_n = N // n
+
+    n_cores = 2
+    tiles = N_div_n // n_cores
+    buffer_depth = 2
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.ipu)
+        def device_body():
+            memRef_ty = T.memref(n, T.bf16())
+
+            # Type used in the tile memory
+            memRef_A_ty = T.memref(n, T.bf16())
+            memRef_B_ty = T.memref(n, T.bf16())
+            memRef_C_ty = T.memref(n, T.bf16())
+
+            # Type used in the memory tile which aggregates across the n_cores cores
+            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
+            memRef_B_MT_ty = T.memref(n * n_cores, T.bf16())
+            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())
+
+            # AIE Core Function declarations
+            # Only the vector variant is called by the cores below.
+            eltwise_mul_bf16_scalar = external_func(
+                "eltwise_mul_bf16_scalar", inputs=[memRef_ty, memRef_ty, memRef_ty]
+            )
+            eltwise_mul_bf16_vector = external_func(
+                "eltwise_mul_bf16_vector", inputs=[memRef_ty, memRef_ty, memRef_ty]
+            )
+            # elwise_int32 = external_func("scale_int32", inputs=[memRef_ty, memRef_ty])
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+
+            MemTile = tile(0, 1)
+            cores = [tile(0, 2 + i) for i in range(n_cores)]
+
+            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
+            inB_fifo_names = [f"memB{i}" for i in range(n_cores)]
+            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]
+
+            inA_fifos = {}
+            inB_fifos = {}
+            outC_fifos = {}
+
+            # AIE-array data movement with object fifos
+            # Input A
+            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
+            for i in range(n_cores):
+                inA_fifos[inA_fifo_names[i]] = object_fifo(
+                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
+                )
+            object_fifo_link(inA, inA_fifo_names)
+
+            # Input B
+            inB = object_fifo("inB", ShimTile, MemTile, buffer_depth, memRef_B_MT_ty)
+            for i in range(n_cores):
+                inB_fifos[inB_fifo_names[i]] = object_fifo(
+                    inB_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_B_ty
+                )
+            object_fifo_link(inB, inB_fifo_names[0:n_cores])
+
+            # Output C
+            for i in range(n_cores):
+                outC_fifos[outC_fifo_names[i]] = object_fifo(
+                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
+                )
+            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
+            object_fifo_link(outC_fifo_names[0:n_cores], outC)
+
+            # Set up compute tiles
+            for i in range(n_cores):
+                # Compute tile i
+                @core(cores[i], "mul.o")
+                def core_body():
+                    for _ in for_(0xFFFFFFFF):
+                        for _ in for_(tiles):
+                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
+                                ObjectFifoPort.Consume, 1
+                            )
+                            elem_in_b = inB_fifos[inB_fifo_names[i]].acquire(
+                                ObjectFifoPort.Consume, 1
+                            )
+
+                            call(
+                                eltwise_mul_bf16_vector,
+                                [elem_in_a, elem_in_b, elem_out],
+                            )
+                            inA_fifos[inA_fifo_names[i]].release(
+                                ObjectFifoPort.Consume, 1
+                            )
+                            inB_fifos[inB_fifo_names[i]].release(
+                                ObjectFifoPort.Consume, 1
+                            )
+                            outC_fifos[outC_fifo_names[i]].release(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            yield_([])
+                        yield_([])
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+            # NOTE(review): tensor_ty holds N i32s but transfers use N_in_bytes // 4.
+            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
+            def sequence(A, B, C):
+                ipu_dma_memcpy_nd(
+                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s]
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+my_eltwise_mul()
diff --git a/programming_examples/basic/eltwise_mul/test.cpp b/programming_examples/basic/eltwise_mul/test.cpp
new file mode 100644
index 0000000000..c117c60c8f
--- /dev/null
+++ b/programming_examples/basic/eltwise_mul/test.cpp
@@ -0,0 +1,297 @@
+//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <bits/stdc++.h>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+// ------------------------------------------------------
+// Configure this to match your buffer data type
+// ------------------------------------------------------
+using INOUT0_DATATYPE = std::bfloat16_t;
+using INOUT1_DATATYPE = std::bfloat16_t;
+using INOUT2_DATATYPE = std::bfloat16_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Verify results (specific to our design example)
+// Returns how many of the first `size` outputs C[i] miss ref = A[i] * B[i]
+// ----------------------------------------------------------------------------
+template <typename T>
+int verify(int size, const std::vector<T> &A, const std::vector<T> &B,
+           const std::vector<T> &C, int verbosity) {
+  int errors = 0;
+  for (int i = 0; i < size; i++) {
+    const T ref = A[i] * B[i];
+    if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) {
+      std::cout << "Error in output " << C[i] << " != " << ref << " from "
+                << A[i] << " * " << B[i] << std::endl;
+      errors++;
+    } else if (verbosity > 1) {
+      std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
+    }
+  }
+  return errors;
+}
+
+// ----------------------------------------------------------------------------
+// Main: run the design through XRT, verify C == A * B, and report timings
+// ----------------------------------------------------------------------------
+int main(int argc, const char *argv[]) {
+
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  test_utils::add_default_options(desc);
+
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  bool do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
+
+  // ------------------------------------------------------
+  // Configure this to match your design's buffer size
+  // ------------------------------------------------------
+  int INOUT0_VOLUME = 65536;         // Input A, in elements
+  int INOUT1_VOLUME = INOUT0_VOLUME; // Input B, in elements
+  int INOUT2_VOLUME = INOUT0_VOLUME; // Output C, in elements
+
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+  size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE);
+
+  // TODO Remove trace for now?
+  size_t OUT_SIZE = INOUT2_SIZE + trace_size;
+
+  srand(time(nullptr));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Load the kernel
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin (first one whose name starts with Node)
+  auto xkernels = xclbin.get_kernels();
+  auto matches_node = [&Node, verbosity](xrt::xclbin::kernel &k) {
+    auto name = k.get_name();
+    if (verbosity >= 1) {
+      std::cout << "Name: " << name << std::endl;
+    }
+    return name.rfind(Node, 0) == 0;
+  };
+  auto found = std::find_if(xkernels.begin(), xkernels.end(), matches_node);
+  if (found == xkernels.end()) {
+    // Dereferencing end() is undefined behaviour; fail with a message instead.
+    std::cerr << "Kernel '" << Node << "' not found in xclbin.\n";
+    return 1;
+  }
+  auto kernelName = found->get_name();
+
+  // Register xclbin
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+  device.register_xclbin(xclbin);
+
+  // Get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // Get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  // Assumes trace will only be added to inout2
+  auto bo_inout2 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Initialize instruction buffer
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  // Initialize Inout buffer 0
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
+  for (int i = 0; i < INOUT0_VOLUME; i++)
+    AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0,
+                                            (std::bfloat16_t)-0.5);
+  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
+
+  // Initialize Inout buffer 1
+  INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+  std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
+  for (int i = 0; i < INOUT1_VOLUME; i++)
+    BVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0,
+                                            (std::bfloat16_t)-0.5);
+  memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
+
+  // Initialize Inout buffer 2
+  char *bufInOut2 = bo_inout2.map<char *>();
+  std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
+  memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
+
+  // Sync buffers to update input buffer values
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0;
+
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run =
+        kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    if (iter < n_warmup_iterations) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Copy output results and verify they are correct
+    memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE)));
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      // Accumulate so a failure in any timed iteration fails the whole run.
+      errors += verify(INOUT0_VOLUME, AVec, BVec, CVec, verbosity);
+      auto vstop = std::chrono::system_clock::now();
+      // duration<float> keeps fractional seconds; a cast to seconds truncates.
+      float vtime = std::chrono::duration<float>(vstop - vstart).count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
+    }
+
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/programming_examples/basic/eltwise_mul/test.py b/programming_examples/basic/eltwise_mul/test.py
new file mode 100644
index 0000000000..cc132020d9
--- /dev/null
+++ b/programming_examples/basic/eltwise_mul/test.py
@@ -0,0 +1,157 @@
+# test.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+# import argparse
+import numpy as np
+import pyxrt as xrt
+import sys
+import time
+
+sys.path.append("../../programming_examples/utils")
+import test_utils
+
+# ------------------------------------------------------
+# Configure this to match your design's buffer size
+# ------------------------------------------------------
+INOUT0_VOLUME = 64  # Input only, 64x uint32_t in this example
+INOUT1_VOLUME = 64  # Not used in this example
+INOUT2_VOLUME = 64  # Output only, 64x uint32_t in this example
+
+INOUT0_DATATYPE = np.uint32
+INOUT1_DATATYPE = np.uint32
+INOUT2_DATATYPE = np.uint32
+
+INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
+INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
+INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
+
+
+def main(opts):
+
+    # Load instruction sequence
+    with open(opts.instr, "r") as f:
+        instr_text = f.read().split("\n")
+        instr_text = [l for l in instr_text if l != ""]
+        instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32)
+
+    # ------------------------------------------------------
+    # Get device, load the xclbin & kernel and register them
+    # ------------------------------------------------------
+
+    # Get a device handle
+    device = xrt.device(0)
+
+    # Load the xclbin
+    xclbin = xrt.xclbin(opts.xclbin)
+
+    # Load the kernel
+    kernels = xclbin.get_kernels()
+    try:
+        xkernel = [k for k in kernels if opts.kernel in k.get_name()][0]
+    except:
+        print(f"Kernel '{opts.kernel}' not found in '{opts.xclbin}'")
+        exit(-1)
+
+    # Register xclbin
+    device.register_xclbin(xclbin)
+
+    # Get a hardware context
+    context = xrt.hw_context(device, xclbin.get_uuid())
+
+    # get a kernel handle
+    kernel = xrt.kernel(context, xkernel.get_name())
+
+    # ------------------------------------------------------
+    # Initialize input/ output buffer sizes and sync them
+    # ------------------------------------------------------
+    bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
+    bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2))
+    bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3))
+    bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4))
+
+    # Initialize instruction buffer
+    bo_instr.write(instr_v, 0)
+
+    # Initialize data buffers
+    inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE)
+    inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE)
+    inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE)
+    bo_inout0.write(inout0, 0)
+    bo_inout1.write(inout1, 0)
+    bo_inout2.write(inout2, 0)
+
+    # Sync buffers to update input buffer values
+    bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+    bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+    bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+    bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+
+    # ------------------------------------------------------
+    # Initialize run configs
+    # ------------------------------------------------------
+    num_iter = opts.iters + opts.warmup_iters
+    npu_time_total = 0
+    npu_time_min = 9999999
+    npu_time_max = 0
+    errors = 0
+
+    # ------------------------------------------------------
+    # Main run loop
+    # ------------------------------------------------------
+    for i in range(num_iter):
+        # Run kernel
+        if opts.verbosity >= 1:
+            print("Running Kernel.")
+        start = time.time_ns()
+        h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2)
+        h.wait()
+        stop = time.time_ns()
+        bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
+
+        # Warmup iterations do not count towards average runtime.
+        if i < opts.warmup_iters:
+            continue
+
+        # Copy output results and verify they are correct
+        out_size = INOUT2_SIZE + opts.trace_size
+        output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE)
+        if opts.verify:
+            if opts.verbosity >= 1:
+                print("Verifying results ...")
+            ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE)
+            e = np.equal(output_buffer, ref)
+            errors = errors + np.size(e) - np.count_nonzero(e)
+
+        # Write trace values if trace_size > 0
+        if opts.trace_size > 0:
+            print("Do something with trace!")
+
+        npu_time = stop - start
+        npu_time_total = npu_time_total + npu_time
+        npu_time_min = min(npu_time_min, npu_time)
+        npu_time_max = max(npu_time_max, npu_time)
+
+    # ------------------------------------------------------
+    # Print verification and timing results
+    # ------------------------------------------------------
+
+    # TODO - Mac count to guide gflops
+
+    print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000)))
+    print("\nMin NPU time: {}us.".format(int((npu_time_min / opts.iters) / 1000)))
+    print("\nMax NPU time: {}us.".format(int((npu_time_max / opts.iters) / 1000)))
+
+    if not errors:
+        print("\nPASS!\n")
+        exit(0)
+    else:
+        print("\nError count: ", errors)
+        print("\nFailed.\n")
+        exit(-1)
+
+
+if __name__ == "__main__":
+    opts = test_utils.parse_args(sys.argv[1:])
+    main(opts)
diff --git a/programming_examples/basic/eltwise_add/CMakeLists.txt b/programming_examples/basic/relu/CMakeLists.txt
similarity index 100%
rename from programming_examples/basic/eltwise_add/CMakeLists.txt
rename to programming_examples/basic/relu/CMakeLists.txt
diff --git a/programming_examples/basic/eltwise_add/Makefile b/programming_examples/basic/relu/Makefile
old mode 100755
new mode 100644
similarity index 77%
rename from programming_examples/basic/eltwise_add/Makefile
rename to programming_examples/basic/relu/Makefile
index 98e5a70f1f..f862aad403
--- a/programming_examples/basic/eltwise_add/Makefile
+++ b/programming_examples/basic/relu/Makefile
@@ -8,19 +8,19 @@
 
 include ../makefile-common
 
-targetname = eltwiseAdd
+targetname = testRelu
 
 all: build/final.xclbin build/insts.txt
 
-build/%.o: %.cc
+build/bf16_relu.o: ${REPO_ROOT}/aie_kernels/relu.cc
 	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $< -o ${@F}
 
 build/aie.mlir: aie2.py
 	mkdir -p ${@D}
 	python3 $< > $@
 
-build/final.xclbin: build/aie.mlir build/add.o
+build/final.xclbin: build/aie.mlir build/bf16_relu.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
 				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
@@ -39,6 +39,10 @@ endif
 run: ${targetname}.exe build/final.xclbin build/insts.txt 
 	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
 
+run_g: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --trace_sz 65536
+
+
 trace:
 	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
 
diff --git a/programming_examples/basic/relu/aie2.py b/programming_examples/basic/relu/aie2.py
new file mode 100644
index 0000000000..8204706127
--- /dev/null
+++ b/programming_examples/basic/relu/aie2.py
@@ -0,0 +1,209 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+
+def my_relu():
+    """Print the MLIR for a bf16 ReLU split over n_cores cores, optionally traced."""
+    word_size_in = 2  # bytes per bf16 element
+    N = 65536
+    N_in_bytes = N * word_size_in
+
+    A_sz_in_i32s = N_in_bytes // 4
+    C_sz_in_i32s = N_in_bytes // 4
+
+    enable_tracing = True
+    trace_size = 65536
+
+    # Tile sizes
+    n = 1024
+    N_div_n = N // n
+
+    n_cores = 2
+    tiles = N_div_n // n_cores
+    buffer_depth = 2
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.ipu)
+        def device_body():
+            memRef_ty = T.memref(n, T.bf16())
+
+            # Type used in the tile memory
+            memRef_A_ty = T.memref(n, T.bf16())
+            memRef_C_ty = T.memref(n, T.bf16())
+
+            # Type used in the memory tile which aggregates across the n_cores cores
+            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
+            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())
+
+            # AIE Core Function declarations
+
+            bf16_relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty])
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+
+            MemTile = tile(0, 1)
+            cores = [tile(0, 2 + i) for i in range(n_cores)]
+
+            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
+            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]
+
+            inA_fifos = {}
+            outC_fifos = {}
+
+            # AIE-array data movement with object fifos
+            # Input A
+            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
+            for i in range(n_cores):
+                inA_fifos[inA_fifo_names[i]] = object_fifo(
+                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
+                )
+            object_fifo_link(inA, inA_fifo_names)
+
+            # Output C
+            for i in range(n_cores):
+                outC_fifos[outC_fifo_names[i]] = object_fifo(
+                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
+                )
+            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
+            object_fifo_link(outC_fifo_names[0:n_cores], outC)
+
+            # Set up a circuit-switched flow from core to shim for tracing information
+            if enable_tracing:
+                flow(cores[0], WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)
+
+            # Set up compute tiles
+            for i in range(n_cores):
+                # Compute tile i
+                @core(cores[i], "bf16_relu.o")
+                def core_body():
+                    for _ in for_(0xFFFFFFFF):
+                        for _ in for_(tiles):
+                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
+                                ObjectFifoPort.Consume, 1
+                            )
+
+                            call(bf16_relu, [elem_in_a, elem_out])
+
+                            inA_fifos[inA_fifo_names[i]].release(
+                                ObjectFifoPort.Consume, 1
+                            )
+                            outC_fifos[outC_fifo_names[i]].release(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            yield_([])
+                        yield_([])
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
+
+                # Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
+                if enable_tracing:
+                    # 0x340D0: Trace Control 0
+                    #          0xAABB---C
+                    #            AA        <- Event to stop trace capture
+                    #              BB      <- Event to start trace capture
+                    #                   C  <- Trace mode, 00=event-time, 01=event-PC, 10=execution
+                    # Configure so that "Event 1" (always true) causes tracing to start
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340D0,
+                        value=0x00010000,
+                    )
+                    # 0x340D4: Trace Control 1
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340D4,
+                        value=0x00000000,
+                    )
+                    # 0x340E0: Trace Event Group 1  (Which events to trace)
+                    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340E0,
+                        value=0x00222100,
+                    )
+                    # 0x340E4: Trace Event Group 2  (Which events to trace)
+                    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340E4,
+                        value=0x00000000,
+                    )
+                    # NOTE(review): purpose of the 0x3FF00 write is undocumented.
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x3FF00,
+                        value=0x00000121,
+                    )
+
+                    # Configure a buffer descriptor to write tracing information that has been routed into this shim tile
+                    # out to host DDR memory
+                    trace_bd_id = 13  # use BD 13 for writing trace output from compute tile to DDR host memory
+                    output_size = N_in_bytes
+                    ipu_writebd_shimtile(
+                        bd_id=trace_bd_id,
+                        buffer_length=trace_size,
+                        buffer_offset=output_size,
+                        enable_packet=0,
+                        out_of_order_id=0,
+                        packet_id=0,
+                        packet_type=0,
+                        column=0,
+                        column_num=1,
+                        d0_size=0,
+                        d0_stride=0,
+                        d1_size=0,
+                        d1_stride=0,
+                        d2_stride=0,
+                        ddr_id=1,
+                        iteration_current=0,
+                        iteration_size=0,
+                        iteration_stride=0,
+                        lock_acq_enable=0,
+                        lock_acq_id=0,
+                        lock_acq_val=0,
+                        lock_rel_id=0,
+                        lock_rel_val=0,
+                        next_bd=0,
+                        use_next_bd=0,
+                        valid_bd=1,
+                    )
+                    # Set start BD to our shim bd_Id (13)
+                    ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)
+
+                ipu_dma_memcpy_nd(
+                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+my_relu()
diff --git a/programming_examples/basic/eltwise_add/test.cpp b/programming_examples/basic/relu/test.cpp
similarity index 65%
rename from programming_examples/basic/eltwise_add/test.cpp
rename to programming_examples/basic/relu/test.cpp
index 67ca9d2e97..14bb24babe 100644
--- a/programming_examples/basic/eltwise_add/test.cpp
+++ b/programming_examples/basic/relu/test.cpp
@@ -16,6 +16,7 @@
 #include <ctime>
 #include <fstream>
 #include <iostream>
+#include <math.h>
 #include <sstream>
 #include <stdfloat>
 #include <string>
@@ -45,15 +46,24 @@ void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
   }
 }
 
-static inline std::bfloat16_t random_bfloat16_t() {
+// Writes trace_size bytes at traceOutPtr to path, one 8-digit hex word a line.
+void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) {
+  std::ofstream fout(path);
+  uint32_t *traceOut = (uint32_t *)traceOutPtr;
+  for (size_t i = 0; i < trace_size / sizeof(traceOut[0]); i++)
+    fout << std::setfill('0') << std::setw(8) << std::hex << traceOut[i]
+         << std::endl;
+}
+
+static inline std::bfloat16_t random_bfloat16_t(float scale, float bias) {
   // Random numbers should NOT be uniformly between 0 and 1, because that
   // would make the matrix product AB always close to 1.
-  return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX));
+  return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) - bias);
 }
 
 bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) {
   std::bfloat16_t diff = fabs(a - b);
-  if ((diff / a) < 0.01)
+  if ((diff / 4.0) < 0.001)
     return true;
   else
     return false;
@@ -84,8 +94,12 @@ int main(int argc, const char *argv[]) {
       "the input xclbin path")(
       "kernel,k", po::value<std::string>()->required(),
       "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "verbosity,v", po::value<int>()->default_value(0),
-      "the verbosity of the output")(
+      "trace_sz,t", po::value<int>()->default_value(0),
+      "the depth of the trace buffer")(
+      "trace_file,f", po::value<std::string>()->default_value("trace.txt"),
+      "the output trace path")("verbosity,v",
+                               po::value<int>()->default_value(0),
+                               "the verbosity of the output")(
       "instr,i", po::value<std::string>()->required(),
       "path of file containing userspace instructions to be sent to the LX6");
   po::variables_map vm;
@@ -114,6 +128,8 @@ int main(int argc, const char *argv[]) {
   if (verbosity >= 1)
     std::cout << "Sequence instr count: " << instr_v.size() << "\n";
 
+  int trace_size = vm["trace_sz"].as<int>();
+
   // Start the XRT test code
   // Get a device handle
   unsigned int device_index = 0;
@@ -158,10 +174,10 @@ int main(int argc, const char *argv[]) {
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
   auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(std::bfloat16_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+  auto real_out_size = OUT_SIZE * sizeof(std::bfloat16_t) + trace_size;
+  auto bo_out = xrt::bo(device, real_out_size, XRT_BO_FLAGS_HOST_ONLY,
+                        kernel.group_id(3));
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects.\n";
@@ -169,25 +185,18 @@ int main(int argc, const char *argv[]) {
   std::bfloat16_t *bufA = bo_inA.map<std::bfloat16_t *>();
   std::vector<std::bfloat16_t> AVec(IN_SIZE);
   for (int i = 0; i < IN_SIZE; i++)
-    AVec[i] = random_bfloat16_t();
+    AVec[i] = random_bfloat16_t(4.0, 2.0);
   memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t)));
 
-  std::bfloat16_t *bufB = bo_inB.map<std::bfloat16_t *>();
-  std::vector<std::bfloat16_t> BVec(IN_SIZE);
-  for (int i = 0; i < IN_SIZE; i++)
-    BVec[i] = random_bfloat16_t();
-  memcpy(bufB, BVec.data(), (BVec.size() * sizeof(std::bfloat16_t)));
-
   void *bufInstr = bo_instr.map<void *>();
   memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
 
   bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
   bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE);
 
   int sticky_errors = 0;
 
-  unsigned num_iter = 256;
+  unsigned num_iter = 2;
   float npu_time_total = 0;
   float npu_time_min = 9999999;
   float npu_time_max = 0;
@@ -198,7 +207,7 @@ int main(int argc, const char *argv[]) {
 
     auto start = std::chrono::high_resolution_clock::now();
 
-    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out);
     run.wait();
     auto stop = std::chrono::high_resolution_clock::now();
 
@@ -213,11 +222,13 @@ int main(int argc, const char *argv[]) {
         std::cout << "Verifying results ..." << std::endl;
       }
       for (uint32_t i = 0; i < IN_SIZE; i++) {
-        std::bfloat16_t ref = AVec[i] + BVec[i];
+        std::bfloat16_t ref = 0.0;
+        if (AVec[i] > 0.0)
+          ref = AVec[i];
         if (!nearly_equal(*(bufOut + i), ref)) {
           std::cout << "Error in " << i << " output " << *(bufOut + i)
-                    << " != " << ref << " actual " << AVec[i] << " + "
-                    << BVec[i] << std::endl;
+                    << " != " << ref << " actual max(" << AVec[i] << ", 0.0"
+                    << std::endl;
           errors++;
           sticky_errors++;
         } else {
@@ -240,10 +251,17 @@ int main(int argc, const char *argv[]) {
     npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
     npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
 
-    if (VERIFY && !errors) {
-      std::cout << iter << ": pass!\n";
-    } else {
-      std::cout << iter << ": fail! " << errors << " errors\n";
+    if (trace_size > 0) {
+      write_out_trace(((char *)bufOut) + (OUT_SIZE * 2), trace_size,
+                      vm["trace_file"].as<std::string>());
+    }
+
+    if (VERIFY) {
+      if (!errors) {
+        std::cout << iter << ": pass!\n";
+      } else {
+        std::cout << iter << ": fail! " << errors << " errors\n";
+      }
     }
   }
 
@@ -252,11 +270,57 @@ int main(int argc, const char *argv[]) {
   std::cout << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
   std::cout << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
 
-  if (VERIFY && !sticky_errors) {
-    std::cout << "\nPASS!\n\n";
-    return 0;
+  // Let's figure out how many cycles it takes a core to do a single e^x
+  // There are 4 cores, so the total number of e^x's it does is one quarter of
+  // the test size
+
+  int per_core_calcs = IN_SIZE / 4;
+  float avg_npu_time = npu_time_total / num_iter;
+  float avg_npu_clocks =
+      avg_npu_time / 1.0E-3; // Time is in uS, but the AIE is clocked in nS
+  float clocks_per_calc = avg_npu_clocks / per_core_calcs;
+  std::cout << "Clocks per calc " << clocks_per_calc << std::endl;
+
+  // Lets benchmark the CPU
+  float cpu_time_total = 0;
+  float cpu_time_min = 9999999;
+  float cpu_time_max = 0;
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    std::vector<std::bfloat16_t> AVec(IN_SIZE);
+    std::vector<std::bfloat16_t> ResVec(IN_SIZE);
+    for (int i = 0; i < IN_SIZE; i++) {
+      AVec[i] = random_bfloat16_t(4.0, 2.0);
+    }
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < IN_SIZE; i++) {
+      ResVec[i] = exp(AVec[i]);
+    }
+    auto stop = std::chrono::high_resolution_clock::now();
+    float cpu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    cpu_time_total += cpu_time;
+    cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min;
+    cpu_time_max = (cpu_time > cpu_time_max) ? cpu_time : cpu_time_max;
+  }
+  std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us."
+            << std::endl;
+  std::cout << "Min CPU matmul time: " << cpu_time_min << "us." << std::endl;
+  std::cout << "Max CPU matmul time: " << cpu_time_max << "us." << std::endl;
+
+  if (VERIFY) {
+    if (!sticky_errors) {
+      std::cout << std::endl << "PASS!" << std::endl << std::endl;
+      return 0;
+    } else {
+      std::cout << std::endl << "FAIL." << std::endl << std::endl;
+      return 1;
+    }
   } else {
-    std::cout << "\nFAIL.\n\n";
-    return 1;
+    std::cout << "Verification skipped, but I'm sure it worked.  I trust in you"
+              << std::endl;
   }
+  return 0;
 }
diff --git a/programming_examples/ml/eltwise_add/CMakeLists.txt b/programming_examples/ml/eltwise_add/CMakeLists.txt
new file mode 100644
index 0000000000..c64f84842b
--- /dev/null
+++ b/programming_examples/ml/eltwise_add/CMakeLists.txt
@@ -0,0 +1,69 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line; target_link_directories() below requires >= 3.13
+cmake_minimum_required(VERSION 3.13)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC 
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+    ../../../programming_examples/utils
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/ml/eltwise_add/Makefile b/programming_examples/ml/eltwise_add/Makefile
new file mode 100644
index 0000000000..dd75274321
--- /dev/null
+++ b/programming_examples/ml/eltwise_add/Makefile
@@ -0,0 +1,47 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../../programming_examples/basic/makefile-common
+
+all: build/final.xclbin
+
+targetname = myEltwiseAdd
+
+build/add.o:
+	mkdir -p ${@D}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/aie_kernels/aie2/add.cc -o ${@F}
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+build/final.xclbin: build/aie.mlir build/add.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+#	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+run_py: build/final.xclbin build/insts.txt
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+	rm -rf build _build ${targetname}.exe
diff --git a/programming_examples/ml/eltwise_add/README.md b/programming_examples/ml/eltwise_add/README.md
new file mode 100644
index 0000000000..10dfc0a916
--- /dev/null
+++ b/programming_examples/ml/eltwise_add/README.md
@@ -0,0 +1,20 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+# <ins>Section 3 - My First Program</ins>
+
+In this section, we'll put together what you learned in [section-1](../section-1) for defining a basic structural design in Python and combine it with the data movement part from [section-2](../section-2) to build our first program. We will then run a simulation on this program as well as run this design on hardware (Ryzen AI).
+
+* Introduce example of first simple program (Bias Add)
+    * Walk through syntax of aie2.py, test.cpp, test_utils.h, maybe CMakeLists.txt and Makefile/ makefile-common as well
+    * need to remove trace parts from test.cpp for now and move it to Section-4
+
+* Illustrate how to use the built-in simulation of a single-core design
+* Illustrate how to run designs on Ryzen AI enabled hardware
diff --git a/programming_examples/basic/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py
old mode 100755
new mode 100644
similarity index 100%
rename from programming_examples/basic/eltwise_add/aie2.py
rename to programming_examples/ml/eltwise_add/aie2.py
diff --git a/programming_examples/ml/eltwise_add/test.cpp b/programming_examples/ml/eltwise_add/test.cpp
new file mode 100644
index 0000000000..eb38eeb1de
--- /dev/null
+++ b/programming_examples/ml/eltwise_add/test.cpp
@@ -0,0 +1,297 @@
+//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <bits/stdc++.h>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+// ------------------------------------------------------
+// Configure this to match your buffer data type
+// ------------------------------------------------------
+using INOUT0_DATATYPE = std::bfloat16_t;
+using INOUT1_DATATYPE = std::bfloat16_t;
+using INOUT2_DATATYPE = std::bfloat16_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Verify results (specific to our design example)
+// ----------------------------------------------------------------------------
+// Counts the first `size` elements of C that are not nearly_equal to A + B.
+template <typename T>
+int verify(int size, const std::vector<T> &A, const std::vector<T> &B,
+           const std::vector<T> &C, int verbosity) {
+  int errors = 0;
+  for (int i = 0; i < size; i++) { // signed index matches the signed bound
+    T ref = A[i] + B[i];
+    if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) { // 2^-8
+      std::cout << "Error in output " << C[i] << " != " << ref << " from "
+                << A[i] << " + " << B[i] << std::endl;
+      errors++;
+    } else if (verbosity > 1) {
+      std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
+    }
+  }
+  return errors;
+}
+
+// ----------------------------------------------------------------------------
+// Main
+// ----------------------------------------------------------------------------
+int main(int argc, const char *argv[]) {
+
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  test_utils::add_default_options(desc);
+
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  int do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
+
+  // ------------------------------------------------------
+  // Configure this to match your design's buffer size
+  // ------------------------------------------------------
+  int INOUT0_VOLUME = 65536;         // Input A: element count
+  int INOUT1_VOLUME = INOUT0_VOLUME; // Input B: same element count as A
+  int INOUT2_VOLUME =
+      INOUT0_VOLUME; // Output C: same element count as the inputs
+
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+  size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE);
+
+  // TODO Remove trace for now?
+  size_t OUT_SIZE = INOUT2_SIZE + trace_size;
+
+  srand(time(NULL));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Load the kernel
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin
+  auto xkernels = xclbin.get_kernels();
+  auto matches_node = [Node, verbosity](xrt::xclbin::kernel &k) {
+    auto name = k.get_name();
+    if (verbosity >= 1) {
+      std::cout << "Name: " << name << std::endl;
+    }
+    return name.rfind(Node, 0) == 0;
+  };
+  auto found = std::find_if(xkernels.begin(), xkernels.end(), matches_node);
+  if (found == xkernels.end()) {
+    // Dereferencing end() is undefined behavior; fail with a message instead.
+    std::cerr << "Kernel '" << Node << "' not found in xclbin\n";
+    return 1;
+  }
+  auto kernelName = found->get_name();
+
+  // Register xclbin
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+  device.register_xclbin(xclbin);
+
+  // Get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // Get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  // Assumes trace will only be added to inout2
+  auto bo_inout2 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Initialize instruction buffer
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  // Initialize Inout buffer 0
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
+  for (int i = 0; i < INOUT0_VOLUME; i++)
+    AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0,
+                                            (std::bfloat16_t)-0.5);
+  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
+
+  // Initialize Inout buffer 1
+  INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+  std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
+  for (int i = 0; i < INOUT1_VOLUME; i++)
+    BVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0,
+                                            (std::bfloat16_t)-0.5);
+  memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
+
+  // Initialize Inout buffer 2
+  char *bufInOut2 = bo_inout2.map<char *>();
+  std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
+  memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
+
+  // Sync buffers to update input buffer values
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0;
+
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run =
+        kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    if (iter < n_warmup_iterations) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Copy output results and verify them; errors accumulate over all runs
+    memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE)));
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      errors += verify(INOUT0_VOLUME, AVec, BVec, CVec, verbosity);
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
+    }
+
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/programming_examples/ml/eltwise_add/test.py b/programming_examples/ml/eltwise_add/test.py
new file mode 100644
index 0000000000..cc132020d9
--- /dev/null
+++ b/programming_examples/ml/eltwise_add/test.py
@@ -0,0 +1,157 @@
+# test.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+# import argparse
+import numpy as np
+import pyxrt as xrt
+import sys
+import time
+
+sys.path.append("../../programming_examples/utils")
+import test_utils
+
+# ------------------------------------------------------
+# Configure this to match your design's buffer size
+# ------------------------------------------------------
+INOUT0_VOLUME = 64  # Input only, 64x uint32_t in this example
+INOUT1_VOLUME = 64  # Not used in this example
+INOUT2_VOLUME = 64  # Output only, 64x uint32_t in this example
+
+INOUT0_DATATYPE = np.uint32
+INOUT1_DATATYPE = np.uint32
+INOUT2_DATATYPE = np.uint32
+
+INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
+INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
+INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
+
+
+def main(opts):
+
+    # Load instruction sequence
+    with open(opts.instr, "r") as f:
+        instr_text = f.read().split("\n")
+        instr_text = [l for l in instr_text if l != ""]
+        instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32)
+
+    # ------------------------------------------------------
+    # Get device, load the xclbin & kernel and register them
+    # ------------------------------------------------------
+
+    # Get a device handle
+    device = xrt.device(0)
+
+    # Load the xclbin
+    xclbin = xrt.xclbin(opts.xclbin)
+
+    # Load the kernel
+    kernels = xclbin.get_kernels()
+    try:
+        xkernel = [k for k in kernels if opts.kernel in k.get_name()][0]
+    except:
+        print(f"Kernel '{opts.kernel}' not found in '{opts.xclbin}'")
+        exit(-1)
+
+    # Register xclbin
+    device.register_xclbin(xclbin)
+
+    # Get a hardware context
+    context = xrt.hw_context(device, xclbin.get_uuid())
+
+    # get a kernel handle
+    kernel = xrt.kernel(context, xkernel.get_name())
+
+    # ------------------------------------------------------
+    # Initialize input/ output buffer sizes and sync them
+    # ------------------------------------------------------
+    bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
+    bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2))
+    bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3))
+    bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4))
+
+    # Initialize instruction buffer
+    bo_instr.write(instr_v, 0)
+
+    # Initialize data buffers
+    inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE)
+    inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE)
+    inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE)
+    bo_inout0.write(inout0, 0)
+    bo_inout1.write(inout1, 0)
+    bo_inout2.write(inout2, 0)
+
+    # Sync buffers to update input buffer values
+    bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+    bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+    bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+    bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+
+    # ------------------------------------------------------
+    # Initialize run configs
+    # ------------------------------------------------------
+    num_iter = opts.iters + opts.warmup_iters
+    npu_time_total = 0
+    npu_time_min = 9999999
+    npu_time_max = 0
+    errors = 0
+
+    # ------------------------------------------------------
+    # Main run loop
+    # ------------------------------------------------------
+    for i in range(num_iter):
+        # Run kernel
+        if opts.verbosity >= 1:
+            print("Running Kernel.")
+        start = time.time_ns()
+        h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2)
+        h.wait()
+        stop = time.time_ns()
+        bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
+
+        # Warmup iterations do not count towards average runtime.
+        if i < opts.warmup_iters:
+            continue
+
+        # Copy output results and verify they are correct
+        out_size = INOUT2_SIZE + opts.trace_size
+        output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE)
+        if opts.verify:
+            if opts.verbosity >= 1:
+                print("Verifying results ...")
+            ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE)
+            e = np.equal(output_buffer, ref)
+            errors = errors + np.size(e) - np.count_nonzero(e)
+
+        # Write trace values if trace_size > 0
+        if opts.trace_size > 0:
+            print("Do something with trace!")
+
+        npu_time = stop - start
+        npu_time_total = npu_time_total + npu_time
+        npu_time_min = min(npu_time_min, npu_time)
+        npu_time_max = max(npu_time_max, npu_time)
+
+    # ------------------------------------------------------
+    # Print verification and timing results
+    # ------------------------------------------------------
+
+    # TODO - Mac count to guide gflops
+
+    print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000)))
+    print("\nMin NPU time: {}us.".format(int((npu_time_min / opts.iters) / 1000)))
+    print("\nMax NPU time: {}us.".format(int((npu_time_max / opts.iters) / 1000)))
+
+    if not errors:
+        print("\nPASS!\n")
+        exit(0)
+    else:
+        print("\nError count: ", errors)
+        print("\nFailed.\n")
+        exit(-1)
+
+
+if __name__ == "__main__":
+    opts = test_utils.parse_args(sys.argv[1:])
+    main(opts)
diff --git a/programming_examples/utils/test_utils.h b/programming_examples/utils/test_utils.h
index 8eedcde857..083760942e 100644
--- a/programming_examples/utils/test_utils.h
+++ b/programming_examples/utils/test_utils.h
@@ -13,6 +13,8 @@
 #ifndef TEST_UTILS_H
 #define TEST_UTILS_H
 
+#include <bits/stdc++.h>
+
 #include <boost/program_options.hpp>
 #include <cmath>
 
@@ -103,6 +105,13 @@ static inline std::int16_t random_int16_t() {
   return (std::int16_t)rand() % 0x10000;
 }
 
+static inline std::bfloat16_t random_bfloat16_t(std::bfloat16_t scale,
+                                                std::bfloat16_t bias) {
+  // Returns scale * u + bias with u = rand() / RAND_MAX, i.e. u in [0, 1].
+  // Callers set the range: plain [0, 1] inputs keep a matmul AB close to 1.
+  return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) + bias);
+}
+
 // static inline std::bfloat16_t random_bfloat16_t() {
 //   // Random numbers should NOT be uniformly between 0 and 1, because that
 //   // would make the matrix product AB always close to 1.