diff --git a/programming_examples/basic/relu/CMakeLists.txt b/programming_examples/basic/relu/CMakeLists.txt
new file mode 100644
index 0000000000..d9f511062f
--- /dev/null
+++ b/programming_examples/basic/relu/CMakeLists.txt
@@ -0,0 +1,80 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# Builds the host test executable (test.cpp) for this example.
+#
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to the directory containing the xrt_coreutil library
+# -DTARGET_NAME: Target name to be built
+
+# target_link_directories() requires CMake 3.13 or newer.
+cmake_minimum_required(VERSION 3.13)
+
+# The presence of powershell.exe is used as the indicator that the build is
+# driven from WSL and targets Windows; it selects the default paths below.
+find_program(WSL NAMES powershell.exe)
+
+if(NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE PATH "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE PATH "Path to XRT include directory")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE PATH "Path to directory containing xrt_coreutil")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE PATH "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE PATH "Path to XRT include directory")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE PATH "Path to directory containing xrt_coreutil")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+set(ProjectName ${TARGET_NAME})
+set(currentTarget ${TARGET_NAME})
+
+# Place the Release executable directly in the build directory instead of a
+# per-configuration subdirectory.
+if(WSL)
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif()
+
+project(${ProjectName})
+
+# Find packages. Outside WSL the compiled Boost libraries are linked
+# explicitly, so request them as components to get imported targets.
+if(WSL)
+    find_package(Boost REQUIRED)
+else()
+    find_package(Boost REQUIRED COMPONENTS program_options filesystem)
+endif()
+
+add_executable(${currentTarget}
+    test.cpp
+)
+
+# An executable has no consumers, so all usage requirements are PRIVATE.
+target_compile_definitions(${currentTarget} PRIVATE DISABLE_ABI_CHECK=1)
+
+target_include_directories(${currentTarget} PRIVATE
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PRIVATE
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if(NOT WSL)
+    target_link_libraries(${currentTarget} PRIVATE
+        xrt_coreutil
+        Boost::program_options
+        Boost::filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PRIVATE
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/relu/Makefile b/programming_examples/basic/relu/Makefile
new file mode 100644
index 0000000000..f862aad403
--- /dev/null
+++ b/programming_examples/basic/relu/Makefile
@@ -0,0 +1,67 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../makefile-common
+
+targetname = testRelu
+
+# None of these targets name a file that their recipe creates.
+.PHONY: all run run_g trace clean_trace clean
+
+all: build/final.xclbin build/insts.txt
+
+# Compile the AIE kernel source into an object file.
+build/bf16_relu.o: ${REPO_ROOT}/aie_kernels/relu.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $< -o ${@F}
+
+# Generate the MLIR design by running the Python description.
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+# Build the xclbin; the same aiecc.py run also writes build/insts.txt.
+build/final.xclbin: build/aie.mlir build/bf16_relu.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+
+# insts.txt is a side product of the xclbin rule (--ipu-insts-name); this rule
+# tells make where it comes from so clean and parallel builds can resolve it.
+build/insts.txt: build/final.xclbin
+	@test -f $@ && touch $@
+
+# Build the host executable with CMake in a scratch _build directory.
+${targetname}.exe: test.cpp CMakeLists.txt
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+# Same as run, but asks the host program for a 65536-byte trace buffer.
+run_g: ${targetname}.exe build/final.xclbin build/insts.txt
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --trace_sz 65536
+
+# Post-process trace.txt into parse_eventIR_vs.json.
+trace:
+	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
+
+clean_trace:
+	rm -rf tmpTrace trace.txt
+
+clean: clean_trace
+	rm -rf build _build ${targetname}.exe
+
diff --git a/programming_examples/basic/relu/aie2.py b/programming_examples/basic/relu/aie2.py
new file mode 100644
index 0000000000..8204706127
--- /dev/null
+++ b/programming_examples/basic/relu/aie2.py
@@ -0,0 +1,210 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+
+def my_relu(enable_tracing=True, trace_size=65536):
+    """Build the MLIR-AIE design for a bf16 ReLU and print it to stdout.
+
+    The N-element input is moved shim tile -> memory tile -> compute tiles
+    in chunks of n elements per core; each core calls the external
+    bf16_relu kernel on its chunk and the result is moved back to the shim.
+
+    Args:
+        enable_tracing: When true, route the trace port of the first compute
+            tile to the shim tile and program the trace registers.
+        trace_size: Length in bytes of the trace region. It is placed in the
+            output buffer directly after the N output elements, so the
+            host has to allocate that much extra space there.
+    """
+    word_size_in = 2  # bytes per bf16 element
+    N = 65536
+    N_in_bytes = N * word_size_in
+
+    A_sz_in_i32s = N_in_bytes // 4
+    C_sz_in_i32s = N_in_bytes // 4
+
+    # Tile sizes
+    n = 1024
+    N_div_n = N // n
+
+    n_cores = 2
+    tiles = N_div_n // n_cores
+    buffer_depth = 2
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.ipu)
+        def device_body():
+            # Type used in the tile memory
+            memRef_A_ty = T.memref(n, T.bf16())
+            memRef_C_ty = T.memref(n, T.bf16())
+
+            # Type used in the memory tile which aggregates across the n_cores cores
+            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
+            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())
+
+            # AIE Core Function declarations
+            bf16_relu = external_func("bf16_relu", inputs=[memRef_A_ty, memRef_C_ty])
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+            MemTile = tile(0, 1)
+            cores = [tile(0, 2 + i) for i in range(n_cores)]
+
+            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
+            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]
+
+            # AIE-array data movement with object fifos
+            # Input A
+            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
+            inA_fifos = [
+                object_fifo(name, MemTile, core_tile, buffer_depth, memRef_A_ty)
+                for name, core_tile in zip(inA_fifo_names, cores)
+            ]
+            object_fifo_link(inA, inA_fifo_names)
+
+            # Output C
+            outC_fifos = [
+                object_fifo(name, core_tile, MemTile, buffer_depth, memRef_C_ty)
+                for name, core_tile in zip(outC_fifo_names, cores)
+            ]
+            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
+            object_fifo_link(outC_fifo_names, outC)
+
+            # Set up a circuit-switched flow from core to shim for tracing information
+            if enable_tracing:
+                flow(cores[0], WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)
+
+            # Set up compute tiles
+            for i in range(n_cores):
+                in_fifo = inA_fifos[i]
+                out_fifo = outC_fifos[i]
+
+                # Compute tile i
+                @core(cores[i], "bf16_relu.o")
+                def core_body():
+                    for _ in for_(0xFFFFFFFF):
+                        for _ in for_(tiles):
+                            elem_out = out_fifo.acquire(ObjectFifoPort.Produce, 1)
+                            elem_in_a = in_fifo.acquire(ObjectFifoPort.Consume, 1)
+
+                            call(bf16_relu, [elem_in_a, elem_out])
+
+                            in_fifo.release(ObjectFifoPort.Consume, 1)
+                            out_fifo.release(ObjectFifoPort.Produce, 1)
+                            yield_([])
+                        yield_([])
+
+            # To/from AIE-array data movement
+            # NOTE(review): declared as N i32 words although each DMA transfer
+            # below moves only N_in_bytes // 4 words -- confirm this is intended.
+            tensor_ty = T.memref(N, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
+
+                # Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
+                if enable_tracing:
+                    # 0x340D0: Trace Control 0
+                    #          0xAABB---C
+                    #            AA        <- Event to stop trace capture
+                    #              BB      <- Event to start trace capture
+                    #                   C  <- Trace mode, 00=event-time, 01=event-PC, 10=execution
+                    # Configure so that "Event 1" (always true) causes tracing to start
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340D0,
+                        value=0x00010000,
+                    )
+                    # 0x340D4: Trace Control 1
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340D4,
+                        value=0x00000000,
+                    )
+                    # 0x340E0: Trace Event Group 1  (Which events to trace)
+                    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340E0,
+                        value=0x00222100,
+                    )
+                    # 0x340E4: Trace Event Group 2  (Which events to trace)
+                    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340E4,
+                        value=0x00000000,
+                    )
+
+                    # NOTE(review): register 0x3FF00 is not described in this file;
+                    # presumably stream-switch event port selection -- confirm.
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x3FF00,
+                        value=0x00000121,
+                    )
+
+                    # Configure a buffer descriptor to write tracing information that has been routed into this shim tile
+                    # out to host DDR memory
+                    trace_bd_id = 13  # use BD 13 for writing trace output from compute tile to DDR host memory
+                    output_size = N_in_bytes
+                    ipu_writebd_shimtile(
+                        bd_id=trace_bd_id,
+                        buffer_length=trace_size,
+                        buffer_offset=output_size,
+                        enable_packet=0,
+                        out_of_order_id=0,
+                        packet_id=0,
+                        packet_type=0,
+                        column=0,
+                        column_num=1,
+                        d0_size=0,
+                        d0_stride=0,
+                        d1_size=0,
+                        d1_stride=0,
+                        d2_stride=0,
+                        ddr_id=1,
+                        iteration_current=0,
+                        iteration_size=0,
+                        iteration_stride=0,
+                        lock_acq_enable=0,
+                        lock_acq_id=0,
+                        lock_acq_val=0,
+                        lock_rel_id=0,
+                        lock_rel_val=0,
+                        next_bd=0,
+                        use_next_bd=0,
+                        valid_bd=1,
+                    )
+                    # Set start BD to our shim bd_Id (13)
+                    ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)
+
+                ipu_dma_memcpy_nd(
+                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+my_relu()
diff --git a/programming_examples/basic/relu/test.cpp b/programming_examples/basic/relu/test.cpp
new file mode 100644
index 0000000000..14bb24babe
--- /dev/null
+++ b/programming_examples/basic/relu/test.cpp
@@ -0,0 +1,355 @@
+//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <stdfloat>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr bool VERIFY = true;
+
+constexpr int IN_SIZE = 65536;
+constexpr int OUT_SIZE = IN_SIZE;
+
+// Number of compute cores the design in aie2.py splits the work across.
+constexpr int N_CORES = 2;
+
+namespace po = boost::program_options;
+
+// Throws std::runtime_error unless option `name` is present in `vm_in` and
+// names a file that can be opened for reading.
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (!vm_in.count(name))
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+
+  const std::string path = vm_in[name].as<std::string>();
+  std::ifstream test(path);
+  if (!test)
+    throw std::runtime_error("The " + name + " file " + path +
+                             " does not exist.\n");
+}
+
+// Writes `trace_size` bytes starting at `traceOutPtr` to the file `path`, one
+// 32-bit word per line as eight zero-padded hex digits.
+void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) {
+  std::ofstream fout(path);
+  const uint32_t *traceOut = reinterpret_cast<const uint32_t *>(traceOutPtr);
+  const size_t num_words = trace_size / sizeof(traceOut[0]);
+  fout << std::setfill('0') << std::hex;
+  for (size_t i = 0; i < num_words; i++)
+    fout << std::setw(8) << traceOut[i] << '\n';
+}
+
+// Returns a pseudo-random bfloat16 in the range [-bias, scale - bias].
+static inline std::bfloat16_t random_bfloat16_t(float scale, float bias) {
+  // Callers pick a range that straddles zero so that both the clamped and the
+  // pass-through side of ReLU are exercised.
+  return std::bfloat16_t((scale * (float)std::rand() / (float)(RAND_MAX)) - bias);
+}
+
+// Returns true when |a - b| / 4 < 0.001, i.e. an absolute tolerance of about
+// 0.004.
+bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) {
+  std::bfloat16_t diff = std::fabs(a - b);
+  return (diff / 4.0) < 0.001;
+}
+
+// Reads the instruction file at `instr_path`, which holds one hexadecimal
+// 32-bit word per line. Throws std::runtime_error on an unparsable line.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "trace_sz,t", po::value<int>()->default_value(0),
+      "the depth of the trace buffer")(
+      "trace_file,f", po::value<std::string>()->default_value("trace.txt"),
+      "the output trace path")("verbosity,v",
+                               po::value<int>()->default_value(0),
+                               "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+
+    // Handle --help before notify(): notify() throws when a required option
+    // is missing, which is always the case for a bare --help.
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 0;
+    }
+    po::notify(vm);
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  // Report unreadable or malformed input files as an error exit rather than
+  // letting the exception escape main().
+  std::vector<uint32_t> instr_v;
+  try {
+    check_arg_file_exists(vm, "xclbin");
+    check_arg_file_exists(vm, "instr");
+    instr_v = load_instr_sequence(vm["instr"].as<std::string>());
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n";
+    return 1;
+  }
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  int trace_size = vm["trace_sz"].as<int>();
+  if (trace_size < 0) {
+    std::cerr << "Error: trace_sz must not be negative\n";
+    return 1;
+  }
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin: the first one whose name starts with Node
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel_it =
+      std::find_if(xkernels.begin(), xkernels.end(),
+                   [&Node, verbosity](xrt::xclbin::kernel &k) {
+                     auto name = k.get_name();
+                     if (verbosity >= 1)
+                       std::cout << "Name: " << name << std::endl;
+                     return name.rfind(Node, 0) == 0;
+                   });
+  if (xkernel_it == xkernels.end()) {
+    std::cerr << "Error: no kernel matching " << Node << " in the xclbin\n";
+    return 1;
+  }
+  auto kernelName = xkernel_it->get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+
+  // The trace buffer, when requested, is appended after the output data.
+  const size_t out_bytes = OUT_SIZE * sizeof(std::bfloat16_t);
+  auto real_out_size = out_bytes + trace_size;
+  auto bo_out = xrt::bo(device, real_out_size, XRT_BO_FLAGS_HOST_ONLY,
+                        kernel.group_id(3));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  std::bfloat16_t *bufA = bo_inA.map<std::bfloat16_t *>();
+  std::vector<std::bfloat16_t> AVec(IN_SIZE);
+  for (int i = 0; i < IN_SIZE; i++)
+    AVec[i] = random_bfloat16_t(4.0, 2.0);
+  std::memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t)));
+
+  void *bufInstr = bo_instr.map<void *>();
+  std::memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  const std::bfloat16_t zero(0.0f);
+  int sticky_errors = 0;
+
+  unsigned num_iter = 2;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+
+    auto start = std::chrono::high_resolution_clock::now();
+
+    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+
+    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
+
+    int errors = 0;
+
+    if (VERIFY) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      for (int i = 0; i < IN_SIZE; i++) {
+        // Reference ReLU: max(x, 0)
+        std::bfloat16_t ref = (AVec[i] > zero) ? AVec[i] : zero;
+        if (!nearly_equal(bufOut[i], ref)) {
+          std::cout << "Error in " << i << " output " << bufOut[i]
+                    << " != " << ref << " actual max(" << AVec[i] << ", 0.0)"
+                    << std::endl;
+          errors++;
+          sticky_errors++;
+        } else {
+          if (verbosity >= 2)
+            std::cout << "Correct " << i << " output " << bufOut[i]
+                      << " == " << ref << std::endl;
+        }
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: ReLU results not verified." << std::endl;
+    }
+
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+
+    if (trace_size > 0) {
+      write_out_trace(((char *)bufOut) + out_bytes, trace_size,
+                      vm["trace_file"].as<std::string>());
+    }
+
+    if (VERIFY) {
+      if (!errors) {
+        std::cout << iter << ": pass!\n";
+      } else {
+        std::cout << iter << ": fail! " << errors << " errors\n";
+      }
+    }
+  }
+
+  std::cout << "Avg NPU exec time: " << npu_time_total / num_iter << "us."
+            << std::endl;
+  std::cout << "Min NPU exec time: " << npu_time_min << "us." << std::endl;
+  std::cout << "Max NPU exec time: " << npu_time_max << "us." << std::endl;
+
+  // Estimate how many clocks a core spends per ReLU element. The elements are
+  // divided evenly between the N_CORES cores.
+
+  int per_core_calcs = IN_SIZE / N_CORES;
+  float avg_npu_time = npu_time_total / num_iter;
+  // Time is in us; the conversion to ns is taken as the clock count, which
+  // assumes a 1 GHz AIE clock -- TODO confirm
+  float avg_npu_clocks = avg_npu_time / 1.0E-3;
+  float clocks_per_calc = avg_npu_clocks / per_core_calcs;
+  std::cout << "Clocks per calc " << clocks_per_calc << std::endl;
+
+  // Lets benchmark the CPU on the same operation
+  float cpu_time_total = 0;
+  float cpu_time_min = 9999999;
+  float cpu_time_max = 0;
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    std::vector<std::bfloat16_t> CpuIn(IN_SIZE);
+    std::vector<std::bfloat16_t> ResVec(IN_SIZE);
+    for (int i = 0; i < IN_SIZE; i++) {
+      CpuIn[i] = random_bfloat16_t(4.0, 2.0);
+    }
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < IN_SIZE; i++) {
+      ResVec[i] = (CpuIn[i] > zero) ? CpuIn[i] : zero;
+    }
+    auto stop = std::chrono::high_resolution_clock::now();
+    float cpu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    cpu_time_total += cpu_time;
+    cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min;
+    cpu_time_max = (cpu_time > cpu_time_max) ? cpu_time : cpu_time_max;
+  }
+  std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us."
+            << std::endl;
+  std::cout << "Min CPU exec time: " << cpu_time_min << "us." << std::endl;
+  std::cout << "Max CPU exec time: " << cpu_time_max << "us." << std::endl;
+
+  if (VERIFY) {
+    if (!sticky_errors) {
+      std::cout << std::endl << "PASS!" << std::endl << std::endl;
+      return 0;
+    } else {
+      std::cout << std::endl << "FAIL." << std::endl << std::endl;
+      return 1;
+    }
+  } else {
+    std::cout << "Verification skipped, but I'm sure it worked.  I trust in you"
+              << std::endl;
+  }
+  return 0;
+}