diff --git a/aie_kernels/aie2/conv2dk1.cc b/aie_kernels/aie2/conv2dk1.cc
new file mode 100755
index 0000000000..08eb7312e9
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1.cc
@@ -0,0 +1,413 @@
+//===- conv2dk1.cc -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <aie_api/aie.hpp>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#ifdef SCALAR
+
+const int32_t UMAX = 255;
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - scalar
+// act: int8, wts: int8, out: uint8
+//*****************************************************************************
+void conv2dk1_i8_scalar(int8_t *input, int8_t *kernels, uint8_t *output,
+                        const int32_t input_width, const int32_t input_channels,
+                        const int32_t output_channels, const int scale) {
+  event0();
+
+  int x, ic, oc, ic8, oc8;
+  // scale=-17;
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (x = 0; x < input_width; x++) { // col of output image
+      for (oc8 = 0; oc8 < 8; oc8++) {
+        int sum = 0;
+        int sum_srs = 0;
+
+        for (ic = 0; ic < input_channels / 8; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            int val = input[(ic * input_width * 8) + (x * 8) + ic8];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+
+        // sum_srs=sum>>scale;
+        sum_srs = (sum + (1 << (scale - 1))) >> scale;
+        sum_srs = (sum_srs > UMAX) ? UMAX : (sum_srs < 0) ? 0 : sum_srs;
+        // sum_srs = input[(oc*input_width*8) + (x*8) + oc8];
+        output[(oc * input_width * 8) + (x * 8) + oc8] = sum_srs;
+      }
+    }
+  }
+
+  event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - scalar
+// act: uint8, wts: int8, out: uint8
+//*****************************************************************************
+void conv2dk1_ui8_scalar(uint8_t *input, int8_t *kernels, uint8_t *output,
+                         const int32_t input_width,
+                         const int32_t input_channels,
+                         const int32_t output_channels, const int scale) {
+  event0();
+
+  int x, ic, oc, ic8, oc8;
+  // scale=-17;
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (x = 0; x < input_width; x++) { // col of output image
+      for (oc8 = 0; oc8 < 8; oc8++) {
+        int sum = 0;
+        int sum_srs = 0;
+
+        for (ic = 0; ic < input_channels / 8; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            uint8_t val = input[(ic * input_width * 8) + (x * 8) + ic8];
+            int8_t k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                               (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+
+        // sum_srs=sum>>scale;
+        sum_srs = (sum + (1 << (scale - 1))) >> scale;
+        sum_srs = (sum_srs > UMAX) ? UMAX : (sum_srs < 0) ? 0 : sum_srs;
+        // sum_srs = input[(oc*input_width*8) + (x*8) + oc8];
+        output[(oc * input_width * 8) + (x * 8) + oc8] = sum_srs;
+      }
+    }
+  }
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - vector
+// act: int8, wts: int8, out: uint8
+//
+// Assume IC >= 16 as that gives the ideal inner loop schedule
+//
+// TODO - Restricting input_width to be a multiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4, but some strange scheduling currently occurs, so for now
+// we do not.
+//*****************************************************************************
+void conv2dk1_i8_vector(int8_t *input, int8_t *kernels, uint8_t *output,
+                        const int32_t input_width, const int32_t input_channels,
+                        const int32_t output_channels, const int scale) {
+  event0();
+
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, int8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+  ::aie::set_rounding(
+      aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  uint8_t *restrict out_ptr = output;
+
+  const int scaleT = scale;
+
+  MMUL4x8x8 acc_tmp[8];
+  for (int x = 0; x < 8; x++) {
+    acc_tmp[x] = aie::zeros<acc32, 32>();
+  }
+
+  // TODO Keeping this variable gives wrong behavior and a bad schedule!
+  const int iw = input_width;
+  const int iw_32 = (input_width / 4) / 8;
+
+  // const int iw_32_rem = (input_width / 4) % 8;
+  // const int iw_32_rem = (32 / 4) % 8;
+  assert((input_width / 4) % 8 == 0);
+  const int iw_32_rem = 0; // TODO - See restriction
+
+  assert((input_channels / 8) > 2); // Assume IC >= 16
+
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) {
+        for (int ic = 0; ic < (input_channels / 8); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x = 0; x < 8; x++) {
+              aie::vector<int8, 32> in_a = aie::load_v<32>(input);
+              input += 32; // act oc0..3(ic0..7)
+              acc_tmp[x].mac(in_a, in_b);
+            }
+            input += (iw * 8) - 256; // Move to next ic/8 position
+          }
+        // input ptr just moves to next section
+        for (int xx = 0; xx < 8; xx++) {
+          aie::vector<uint8, 32> o1 = acc_tmp[xx].to_vector<uint8>(scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+          acc_tmp[xx] = aie::zeros<acc32, 32>();
+        }
+        input -= ((input_channels / 8) * iw * 8) -
+                 256; // reset to next input_width/32 block
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+      }
+      input -= (iw_32)*256; // 8*32, reset beginning of input ptr
+      kernels += (input_channels / 8) * 64; // move to next oc/8 weights
+      out_ptr += (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32 > 0) {
+
+  if (iw_32_rem > 0) {
+
+    const int ocs = output_channels;
+    const int ics = input_channels;
+
+    for (int oc = 0; oc < (ocs / 8); oc++) {
+      for (int ic = 0; ic < (ics / 8); ic++)
+        chess_prepare_for_pipelining chess_loop_range(2, ) {
+          aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+          kernels += 64; // wts ic0..7(oc0..7)
+
+          for (int x = 0; x < iw_32_rem; x++) {
+            aie::vector<int8, 32> in_a = aie::load_v<32>(input);
+            input += 32; // act oc0..3(ic0..7)
+            acc_tmp[x].mac(in_a, in_b);
+          }
+          input += (iw * 8) - (iw_32_rem * 32); // Move to next ic/8 position
+        }
+      // input ptr just moves to next section
+      for (int xx = 0; xx < iw_32_rem; xx++) {
+        aie::vector<uint8, 32> o1 = acc_tmp[xx].to_vector<uint8>(scaleT);
+        aie::store_v(out_ptr, o1);
+        out_ptr += 32;
+        acc_tmp[xx] = aie::zeros<acc32, 32>();
+      }
+      // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning of
+      // input ptr for remainder
+      input -= 448; // reset to beginning of input ptr for remainder
+      // kernel ptr already at next oc/8
+      out_ptr += (iw * 8) -
+                 (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32_rem > 0)
+
+  event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - vector
+// act: uint8, wts: int8, out: uint8
+//
+// Assume IC >= 16 as that gives the ideal inner loop schedule
+//
+// TODO - Restricting input_width to be a multiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4, but some strange scheduling currently occurs, so for now
+// we do not.
+//*****************************************************************************
+void conv2dk1_ui8_vector(uint8_t *input, int8_t *kernels, uint8_t *output,
+                         const int32_t input_width,
+                         const int32_t input_channels,
+                         const int32_t output_channels, const int scale) {
+  event0();
+
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+  ::aie::set_rounding(
+      aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  uint8_t *restrict out_ptr = output;
+
+  const int scaleT = scale;
+
+  MMUL4x8x8 acc_tmp[8];
+  for (int x = 0; x < 8; x++) {
+    acc_tmp[x] = aie::zeros<acc32, 32>();
+  }
+
+  // TODO Keeping this variable gives wrong behavior and a bad schedule!
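+  // Illustrative arithmetic for the width tiling below (the numbers are
+  // hypothetical, not an added requirement): each MMUL4x8x8 covers 4 pixels
+  // and results rotate through 8 accumulators, so one block spans 4*8 = 32
+  // pixels. E.g. for input_width = 64:
+  //   iw_32     = (64 / 4) / 8 = 2  full 32-pixel blocks
+  //   iw_32_rem = (64 / 4) % 8 = 0  leftover 4-pixel groups (must be 0 here,
+  //                                 per the assert below)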
+  const int iw = input_width;
+  const int iw_32 = (input_width / 4) / 8;
+
+  // const int iw_32_rem = (input_width / 4) % 8;
+  // const int iw_32_rem = (32 / 4) % 8;
+  assert((input_width / 4) % 8 == 0);
+  const int iw_32_rem = 0; // TODO - See restriction
+
+  assert((input_channels / 8) > 2); // Assume IC >= 16
+
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) {
+        for (int ic = 0; ic < (input_channels / 8); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x = 0; x < 8; x++) {
+              aie::vector<uint8, 32> in_a = aie::load_v<32>(input);
+              input += 32; // act oc0..3(ic0..7)
+              acc_tmp[x].mac(in_a, in_b);
+            }
+            input += (iw * 8) - 256; // Move to next ic/8 position
+          }
+        // input ptr just moves to next section
+        for (int xx = 0; xx < 8; xx++) {
+          aie::vector<uint8, 32> o1 = acc_tmp[xx].to_vector<uint8>(scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+          acc_tmp[xx] = aie::zeros<acc32, 32>();
+        }
+        input -= ((input_channels / 8) * iw * 8) -
+                 256; // reset to next input_width/32 block
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+      }
+      input -= (iw_32)*256; // 8*32, reset beginning of input ptr
+      kernels += (input_channels / 8) * 64; // move to next oc/8 weights
+      out_ptr += (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32 > 0) {
+
+  if (iw_32_rem > 0) {
+
+    const int ocs = output_channels;
+    const int ics = input_channels;
+
+    for (int oc = 0; oc < (ocs / 8); oc++) {
+      for (int ic = 0; ic < (ics / 8); ic++)
+        chess_prepare_for_pipelining chess_loop_range(2, ) {
+          aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+          kernels += 64; // wts ic0..7(oc0..7)
+
+          for (int x = 0; x < iw_32_rem; x++) {
+            aie::vector<uint8, 32> in_a = aie::load_v<32>(input);
+            input += 32; // act oc0..3(ic0..7)
+            acc_tmp[x].mac(in_a, in_b);
+          }
+          input += (iw * 8) - (iw_32_rem * 32); // Move to next ic/8 position
+        }
+      // input ptr just moves to next section
+      for (int xx = 0; xx < iw_32_rem; xx++) {
+        aie::vector<uint8, 32> o1 = acc_tmp[xx].to_vector<uint8>(scaleT);
+        aie::store_v(out_ptr, o1);
+        out_ptr += 32;
+        acc_tmp[xx] = aie::zeros<acc32, 32>();
+      }
+      // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning of
+      // input ptr for remainder
+      input -= 448; // reset to beginning of input ptr for remainder
+      // kernel ptr already at next oc/8
+      out_ptr += (iw * 8) -
+                 (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32_rem > 0)
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+//*****************************************************************************
+// conv2d 1x1 wrappers
+//*****************************************************************************
+extern "C" {
+
+#ifdef SCALAR
+
+#ifdef INT8_ACT
+
+void conv2dk1_i8(int8_t *input, int8_t *kernels, uint8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale) {
+  conv2dk1_i8_scalar(input, kernels, output, input_width, input_channels,
+                     output_channels, scale);
+}
+
+#else // UINT8_ACT
+
+void conv2dk1_ui8(uint8_t *input, int8_t *kernels, uint8_t *output,
+                  const int32_t input_width, const int32_t input_channels,
+                  const int32_t output_channels, const int scale) {
+  conv2dk1_ui8_scalar(input, kernels, output, input_width, input_channels,
+                      output_channels, scale);
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+void conv2dk1_i8(int8_t *input, int8_t *kernels, uint8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale) {
+  conv2dk1_i8_vector(input, kernels, output, input_width, input_channels,
+                     output_channels, scale);
+}
+
+#else // UINT8_ACT
+
+void conv2dk1_ui8(uint8_t *input, int8_t *kernels, uint8_t *output,
+                  const int32_t input_width, const int32_t input_channels,
+                  const int32_t output_channels, const int scale) {
+  conv2dk1_ui8_vector(input, kernels, output, input_width, input_channels,
+                      output_channels, scale);
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+} // extern "C"
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk1.h b/aie_kernels/aie2/conv2dk1.h
new file mode 100755
index 0000000000..d3c405435e
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1.h
@@ -0,0 +1,25 @@
+//===- conv2dk1.h -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CONV2DK1_H
+#define _CONV2DK1_H
+
+extern "C" {
+void conv2dk1_i8(int8_t *input, int8_t *kernels, uint8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale);
+
+void conv2dk1_ui8(uint8_t *input, int8_t *kernels, uint8_t *output,
+                  const int32_t input_width, const int32_t input_channels,
+                  const int32_t output_channels, const int scale);
+} // extern "C"
+
+#endif
diff --git a/aie_kernels/aie2/conv2dk1_i8.cc b/aie_kernels/aie2/conv2dk1_i8.cc
new file mode 100644
index 0000000000..73a9d8ed12
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_i8.cc
@@ -0,0 +1,224 @@
+//===- conv2dk1_i8.cc ----------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <aie_api/aie.hpp>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#ifdef SCALAR
+
+const int32_t SMAX = 127;
+const int32_t SMIN = 128;
+
+#ifdef INT8_ACT
+//*****************************************************************************
+// conv2d 1x1 - scalar
+// act: int8, wts: int8, out: int8
+//*****************************************************************************
+void conv2dk1_i8_scalar(int8_t *input, int8_t *kernels, int8_t *output,
+                        const int32_t input_width, const int32_t input_channels,
+                        const int32_t output_channels, const int scale) {
+  event0();
+
+  int x, ic, oc, ic8, oc8;
+  // scale=-17;
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (x = 0; x < input_width; x++) { // col of output image
+      for (oc8 = 0; oc8 < 8; oc8++) {
+        int sum = 0;
+        int sum_srs = 0;
+
+        for (ic = 0; ic < input_channels / 8; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            int val = input[(ic * input_width * 8) + (x * 8) + ic8];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+
+        // sum_srs=sum>>scale;
+        sum_srs = (sum + (1 << (scale - 1))) >> scale;
+        sum_srs = (sum_srs > SMAX) ? SMAX : (sum_srs < -SMIN) ? -SMIN : sum_srs;
+        // sum_srs = input[(oc*input_width*8) + (x*8) + oc8];
+        output[(oc * input_width * 8) + (x * 8) + oc8] = sum_srs;
+      }
+    }
+  }
+
+  event1();
+}
+#endif // INT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - vector
+// act: int8, wts: int8, out: int8
+//
+// Assume IC >= 16 as that gives the ideal inner loop schedule
+//
+// TODO - Restricting input_width to be a multiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4, but some strange scheduling currently occurs, so for now
+// we do not.
+//*****************************************************************************
+void conv2dk1_i8_vector(int8_t *input, int8_t *kernels, int8_t *output,
+                        const int32_t input_width, const int32_t input_channels,
+                        const int32_t output_channels, const int scale) {
+  event0();
+
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, int8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to int8
+  ::aie::set_rounding(aie::rounding_mode::symmetric_inf); // Needed to saturate
+                                                          // properly to int8
+
+  int8_t *restrict out_ptr = output;
+
+  const int scaleT = scale;
+
+  MMUL4x8x8 acc_tmp[8];
+  for (int x = 0; x < 8; x++) {
+    acc_tmp[x] = aie::zeros<acc32, 32>();
+  }
+
+  // TODO Keeping this variable gives wrong behavior and a bad schedule!
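+  // Scalar sketch of what one acc_tmp[x].mac(in_a, in_b) computes below;
+  // the 4/8/8 bounds come from the MMUL4x8x8 shape, and the act/wts/acc
+  // array names are illustrative only:
+  //   for (int m = 0; m < 4; m++)     // 4 pixels
+  //     for (int n = 0; n < 8; n++)   // 8 output channels
+  //       for (int k = 0; k < 8; k++) // 8 input channels
+  //         acc[m][n] += (int32_t)act[m][k] * (int32_t)wts[k][n];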
+  const int iw = input_width;
+  const int iw_32 = (input_width / 4) / 8;
+
+  // const int iw_32_rem = (input_width / 4) % 8;
+  // const int iw_32_rem = (32 / 4) % 8;
+  assert((input_width / 4) % 8 == 0);
+  const int iw_32_rem = 0; // TODO - See restriction
+
+  assert((input_channels / 8) > 2); // Assume IC >= 16
+
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) {
+        for (int ic = 0; ic < (input_channels / 8); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x = 0; x < 8; x++) {
+              aie::vector<int8, 32> in_a = aie::load_v<32>(input);
+              input += 32; // act oc0..3(ic0..7)
+              acc_tmp[x].mac(in_a, in_b);
+            }
+            input += (iw * 8) - 256; // Move to next ic/8 position
+          }
+        // input ptr just moves to next section
+        for (int xx = 0; xx < 8; xx++) {
+          aie::vector<int8, 32> o1 = acc_tmp[xx].to_vector<int8>(scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+          acc_tmp[xx] = aie::zeros<acc32, 32>();
+        }
+        input -= ((input_channels / 8) * iw * 8) -
+                 256; // reset to next input_width/32 block
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+      }
+      input -= (iw_32)*256; // 8*32, reset beginning of input ptr
+      kernels += (input_channels / 8) * 64; // move to next oc/8 weights
+      out_ptr += (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32 > 0) {
+
+  if (iw_32_rem > 0) {
+
+    const int ocs = output_channels;
+    const int ics = input_channels;
+
+    for (int oc = 0; oc < (ocs / 8); oc++) {
+      for (int ic = 0; ic < (ics / 8); ic++)
+        chess_prepare_for_pipelining chess_loop_range(2, ) {
+          aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+          kernels += 64; // wts ic0..7(oc0..7)
+
+          for (int x = 0; x < iw_32_rem; x++) {
+            aie::vector<int8, 32> in_a = aie::load_v<32>(input);
+            input += 32; // act oc0..3(ic0..7)
+            acc_tmp[x].mac(in_a, in_b);
+          }
+          input += (iw * 8) - (iw_32_rem * 32); // Move to next ic/8 position
+        }
+      // input ptr just moves to next section
+      for (int xx = 0; xx < iw_32_rem; xx++) {
+        aie::vector<int8, 32> o1 = acc_tmp[xx].to_vector<int8>(scaleT);
+        aie::store_v(out_ptr, o1);
+        out_ptr += 32;
+        acc_tmp[xx] = aie::zeros<acc32, 32>();
+      }
+      // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning of
+      // input ptr for remainder
+      input -= 448; // reset to beginning of input ptr for remainder
+      // kernel ptr already at next oc/8
+      out_ptr += (iw * 8) -
+                 (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32_rem > 0)
+
+  event1();
+}
+#endif // INT8_ACT
+#endif // Vector
+
+//*****************************************************************************
+// conv2d 1x1 wrappers
+//*****************************************************************************
+extern "C" {
+
+#ifdef SCALAR
+
+#ifdef INT8_ACT
+
+void conv2dk1_i8(int8_t *input, int8_t *kernels, int8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale) {
+  conv2dk1_i8_scalar(input, kernels, output, input_width, input_channels,
+                     output_channels, scale);
+}
+#endif // INT8_ACT
+#else // Vector
+
+#ifdef INT8_ACT
+
+void conv2dk1_i8(int8_t *input, int8_t *kernels, int8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale) {
+  conv2dk1_i8_vector(input, kernels, output, input_width, input_channels,
+                     output_channels, scale);
+}
+#endif // INT8_ACT
+#endif // Vector
+} // extern "C"
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk1_i8.h b/aie_kernels/aie2/conv2dk1_i8.h
new file mode 100644
index 0000000000..98925f8a86
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_i8.h
@@ -0,0 +1,22 @@
+//===- conv2dk1_i8.h -----------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CONV2DK1_I8_H
+#define _CONV2DK1_I8_H
+
+extern "C" {
+void conv2dk1_i8(int8_t *input, int8_t *kernels, int8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale);
+
+} // extern "C"
+
+#endif
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk1_skip.cc b/aie_kernels/aie2/conv2dk1_skip.cc
new file mode 100755
index 0000000000..feaa95333b
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_skip.cc
@@ -0,0 +1,766 @@
+//===- conv2dk1_skip.cc -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+#ifdef SCALAR
+
+const int32_t MIN = 128;
+const int32_t MAX = 127;
+const int32_t UMAX = 255;
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip - scalar
+// act: uint8, wts: int8, skip: int8, out: uint8
+//*****************************************************************************
+void conv2dk1_skip_i8_scalar(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                             uint8_t *output, int8_t *skip,
+                             const int32_t input_width,
+                             const int32_t input_channels,
+                             const int32_t output_channels, const int scale,
+                             const int skip_scale) {
+  event0();
+
+  int x, ic, ic2, oc, oc8, ic8, ic8b;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+  // const int scaleT = 10;
+  // const int skip_scaleT = 0;
+
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (oc8 = 0; oc8 < 8; oc8++) {
+      for (x = 0; x < input_width; x++) { // col of output image
+        int sum = 0;
+        int sum_srs = 0;
+        int64_t skip_sum = 0;
+        int skip_sum_srs_final = 0;
+        int skip_sum_srs_final_out = 0;
+        int skip_temp = 0;
+        for (ic = 0; ic < input_channels / 16; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            // int val = input0[ic * input_width + x];
+            int val = input0[(ic * input_width * 8) + (x * 8) + ic8];
+            // int k = kernels[oc * input_channels + ic];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+        // for (ic2 = input_channels/16; ic2 < input_channels/8; ic2++) {
+        for (ic2 = 0; ic2 < input_channels / 16; ic2++) {
+          for (ic8b = 0; ic8b < 8; ic8b++) {
+            // int val2 = input1[ic2 * input_width + x];
+            int val2 = input1[(ic2 * input_width * 8) + (x * 8) +
+                              ic8b]; // TODO ic2 should be shifted?
+            // int k2 = kernels[oc * input_channels + ic2];
+            int k2 = kernels[(oc * (input_channels / 8) * 64) +
+                             ((ic2 + (input_channels / 16)) * 64) + (ic8b * 8) +
+                             oc8];
+            sum += val2 * k2;
+          }
+        }
+        // scale for convolution
+        sum_srs = (sum + (1 << (scaleT - 1))) >> scaleT;
+        sum_srs = (sum_srs > MAX)    ? MAX
+                  : (sum_srs < -MIN) ? -MIN
+                                     : sum_srs; // clip
+        // sum_srs = (sum_srs > UMAX) ? UMAX : (sum_srs < 0) ? 0 : sum_srs;
+        // //clip
+
+        // scale for residual
+        // skip_temp=skip[oc * input_width + x];
+        skip_temp = skip[(oc * input_width * 8) + (x * 8) + oc8];
+        skip_sum = sum_srs + skip_temp;
+        // skip_sum= sum_srs;
+
+        skip_sum_srs_final =
+            (skip_sum + (1 << (skip_scaleT - 1))) >> skip_scaleT;
+        skip_sum_srs_final_out = (skip_sum_srs_final > UMAX) ? UMAX
+                                 : (skip_sum_srs_final < 0)
+                                     ? 0
+                                     : skip_sum_srs_final; // clip
+
+        // output[oc * input_width + x] = skip_sum_srs_final_out;
+        output[(oc * input_width * 8) + (x * 8) + oc8] = skip_sum_srs_final_out;
+
+        // output[oc * input_width + x] = sum;
+        // output[oc * input_width + x] = sum+skip[oc * input_width + x];
+      }
+    }
+  }
+
+  // for (oc = 0; oc < output_channels; ++oc) {
+  //   for (x = 0; x < input_width; ++x) {
+  //     output[oc * input_width + x]=skip[oc * input_width + x];}
+  // }
+
+  event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip - scalar
+// act: uint8, wts: int8, skip: uint8, out: uint8
+//*****************************************************************************
+void conv2dk1_skip_ui8_scalar(uint8_t *input0, uint8_t *input1,
+                              int8_t *kernels, uint8_t *output, uint8_t *skip,
+                              const int32_t input_width,
+                              const int32_t input_channels,
+                              const int32_t output_channels, const int scale,
+                              const int skip_scale) {
+  event0();
+
+  int x, ic, ic2, oc, oc8, ic8, ic8b;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+  // const int scaleT = 10;
+  // const int skip_scaleT = 0;
+
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (oc8 = 0; oc8 < 8; oc8++) {
+      for (x = 0; x < input_width; x++) { // col of output image
+        int sum = 0;
+        int sum_srs = 0;
+        int skip_sum = 0;
+        int skip_sum_srs_final = 0;
+        int skip_sum_srs_final_out = 0;
+        uint8_t skip_temp = 0;
+        for (ic = 0; ic < input_channels / 16; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            // int val = input0[ic * input_width + x];
+            uint8_t val = input0[(ic * input_width * 8) + (x * 8) + ic8];
+            // int k = kernels[oc * input_channels + ic];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+        for (ic2 = 0; ic2 < input_channels / 16; ic2++) {
+          for (ic8b = 0; ic8b < 8; ic8b++) {
+            // int val2 = input1[ic2 * input_width + x];
+            uint8_t val2 = input1[(ic2 * input_width * 8) + (x * 8) +
+                                  ic8b]; // TODO ic2 should be shifted?
+            // int k2 = kernels[oc * input_channels + ic2];
+            int k2 = kernels[(oc * (input_channels / 8) * 64) +
+                             ((ic2 + (input_channels / 16)) * 64) + (ic8b * 8) +
+                             oc8];
+            sum += val2 * k2;
+          }
+        }
+        // scale for convolution
+        sum_srs = (sum + (1 << (scaleT - 1))) >> scaleT;
+        sum_srs = (sum_srs > MAX)    ? MAX
+                  : (sum_srs < -MIN) ? -MIN
+                                     : sum_srs; // clip
+
+        // scale for residual
+        skip_temp = skip[(oc * input_width * 8) + (x * 8) + oc8];
+        skip_sum = sum_srs + skip_temp;
+
+        // skip_sum= sum_srs;
+
+        skip_sum_srs_final =
+            (skip_sum + (1 << (skip_scaleT - 1))) >> skip_scaleT;
+        skip_sum_srs_final_out = (skip_sum_srs_final > UMAX) ? UMAX
+                                 : (skip_sum_srs_final < 0)
+                                     ? 0
+                                     : skip_sum_srs_final; // clip
+
+        // output[oc * input_width + x] = skip_sum_srs_final_out;
+        output[(oc * input_width * 8) + (x * 8) + oc8] = skip_sum_srs_final_out;
+
+        // output[oc * input_width + x] = sum;
+        // output[oc * input_width + x] = sum+skip[oc * input_width + x];
+      }
+    }
+  }
+
+  // for (oc = 0; oc < output_channels; ++oc) {
+  //   for (x = 0; x < input_width; ++x) {
+  //     output[oc * input_width + x]=skip[oc * input_width + x];}
+  // }
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip - vector
+// act: uint8, wts: int8, skip: int8, out: uint8
+//
+// Assume IC >= 16 as that gives the ideal inner loop schedule
+//
+// TODO - Restricting input_width to be a multiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4, but some strange scheduling currently occurs, so for now
+// we do not.
+//*****************************************************************************
+void conv2dk1_skip_i8_vector(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                             uint8_t *output, int8_t *skip,
+                             const int32_t input_width,
+                             const int32_t input_channels,
+                             const int32_t output_channels, const int scale,
+                             const int skip_scale) {
+  event0();
+
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+  ::aie::set_rounding(
+      aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  uint8_t *restrict out_ptr = output;
+  int8_t *i_out_ptr = (int8_t *)output;
+  int8_t *restrict skip_ptr = skip;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+
+  constexpr int NUM_ACC = 8;
+
+  const int iw_32 = (input_width / 4) / 8;
+  const int iw = input_width;
+  // const int iw_32_rem = (input_width / 4) % 8;
+  assert((input_width / 4) % 8 == 0);
+  const int iw_32_rem = 0; // TODO - See restriction
+
+  assert((input_channels / 8) > 2); // Assume IC >= 16
+
+  int input_offset1 = 0;
+  int input_offset2 = 0;
+
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int x = 0; x < iw_32; x++) {
+        MMUL4x8x8 acc_tmp[NUM_ACC];
+        for (int i = 0; i < NUM_ACC; i++) {
+          acc_tmp[i] = aie::zeros<acc32, 32>();
+        }
+        for (int ic = 0; ic < (input_channels / 16); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++) {
+              aie::vector<uint8, 32> in_a =
+                  aie::load_v<32>(input0 + input_offset1);
+              input_offset1 += 32; // act oc0..3(ic0..7)
+              acc_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset1 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+        for (int ic = 0; ic < (input_channels / 16); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++) {
+              aie::vector<uint8, 32> in_a =
+                  aie::load_v<32>(input1 + input_offset2);
+              input_offset2 += 32; // act oc0..3(ic0..7)
+              acc_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset2 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+        // input ptr just moves to next section
+        for (int x8 = 0; x8 < NUM_ACC; x8++) {
+          aie::vector<int8, 32> skip1 = aie::load_v<32>(skip_ptr);
+          skip_ptr += 32;
+
+          aie::accum<acc32, 32> accj;
+          accj.from_vector(skip1, 0);
+          accj = aie::add(accj, acc_tmp[x8].to_vector<int8>(scaleT));
+          // accj = aie::mac(accj, acc_tmp[x8].to_vector<int8>(scaleT),
+          // (uint8_t)1);
+          aie::vector<uint8, 32> o1 = accj.to_vector<uint8>(skip_scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+          // acc_tmp[x8] = aie::zeros<acc32, 32>();
+        }
+        input_offset1 -=
+            ((input_channels / 16) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        input_offset2 -=
+            ((input_channels / 16) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+      } // for(int x=0; x<iw_32; x++) {
+    } // for(int oc=0; oc<(output_channels/8); oc++) {
+
+    // if(iw_32_rem > 0) {
+    //   for(int x8=0; x8<iw_32_rem; x8++) {
+    //     // aie::vector<int8, 32> skip1 = aie::load_v<32>(skip_ptr);
+    //     skip_ptr += 32; aie::vector<int8, 32> skip1 =
+    //     aie::load_v<32>(skip_ptr); skip_ptr += 32;
+    //     // aie::vector<uint8, 32> tmp = aie::load_v<32>(out_ptr);
+    //     aie::vector<int8, 32> tmp = aie::load_v<32>(i_out_ptr);
+    //     i_out_ptr += 32; aie::accum<acc32, 32> accj;
+    //     accj.from_vector(skip1,0);
+    //     accj = aie::mac(accj, tmp, (uint8_t)1);
+    //     aie::vector<uint8, 32> o3 =
+    //     accj.to_vector<uint8>(skip_scaleT); aie::store_v(out_ptr,
+    //     o3); out_ptr += 32;
+    //   }
+    // }
+    // out_ptr += (iw_32_rem*32);
+    // skip_ptr += (iw_32_rem*32);
+    // }
+
+    out_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+    skip_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+
+  } // if(iw_32 > 0) {
+
+  // **TODO** Move out_ptr and skip_ptr back to first oc/8 rem location
+
+  // if(iw_32_rem > 0) {
+
+  //   const int ocs = output_channels;
+  //   const int ics = input_channels;
+
+  //   input_offset1 = 0; // TODO need to offset this to ic_32_rem position
+  //   input_offset2 = 0; // TODO need to offset this to ic_32_rem position
+
+  //   for(int oc=0; oc<(ocs/8); oc++) {
+  //     for(int ic=0; ic<(ics/16); ic++) {
+  //       // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+  //       (2x 256b loads)
+  //       // For ic > 8, we would load the next 64 weights that are
+  //       ic8..15(oc0..7)
+  //       // For oc > 8, we would load the next 64 weights after all the ic
+  //       weights {OC}{IC}{IC8}{OC8} aie::vector<int8, 64> in_b =
+  //       aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+  //       for(int x=0; x<iw_32_rem; x++) {
+  //         aie::vector<uint8, 32> in_a =
+  //         aie::load_v<32>(input0+input_offset1); input_offset1 += 32; //
+  //         act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+  //       }
+  //       input_offset1 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+  //       position, TODO -(iw_32_rem*8)??
+  //     }
+  //     for(int ic=0; ic<(ics/16); ic++) {
+  //       // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+  //       (2x 256b loads)
+  //       // For ic > 8, we would load the next 64 weights that are
+  //       ic8..15(oc0..7)
+  //       // For oc > 8, we would load the next 64 weights after all the ic
+  //       weights {OC}{IC}{IC8}{OC8} aie::vector<int8, 64> in_b =
+  //       aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+  //       for(int x=0; x<iw_32_rem; x++) {
+  //         aie::vector<uint8, 32> in_a =
+  //         aie::load_v<32>(input1+input_offset2); input_offset2 += 32; //
+  //         act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+  //       }
+  //       input_offset2 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+  //       position
+  //     }
+  //     // input ptr just moves to next section
+  //     for(int xx=0; xx<iw_32_rem; xx++) {
+  //       // aie::vector<uint8, 32> o1 = acc_tmp[xx].to_vector<uint8>(scaleT);
+  //       aie::vector<int8, 32> o1 = acc_tmp[xx].to_vector<int8>(scaleT);
+  //       // aie::store_v(out_ptr, o1); out_ptr += 32;
+  //       aie::store_v(i_out_ptr, o1); i_out_ptr += 32;
+  //       acc_tmp[xx] = aie::zeros<acc32, 32>();
+  //     }
+  //     // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning
+  //     of input ptr for remainder input_offset1 -= 448; // reset to
+  //     beginning of input ptr for remainder input_offset2 -= 448; // reset
+  //     to beginning of input ptr for remainder
+  //     // kernel ptr already at next oc/8
+  //     i_out_ptr += (iw*8)-(iw_32_rem*32); // move to next oc/8
+  //     (skip remainder section if present)
+  //   }
+
+  //   i_out_ptr -= output_channels*iw;
+
+  //   for(int oc=0; oc<(output_channels/8); oc++) {
+  //     for(int x8=0; x8<iw_32_rem; x8++) {
+  //       aie::vector<int8, 32> skip1 = aie::load_v<32>(skip_ptr); skip_ptr +=
+  //       32; aie::vector<int8, 32> tmp = aie::load_v<32>(i_out_ptr);
+  //       aie::accum<acc32, 32> accj;
+  //       accj.from_vector(skip1,0);
+  //       accj = aie::mac(accj, tmp, (uint8_t)1);
+  //       aie::vector<uint8, 32> o3 = accj.to_vector<uint8>(skip_scaleT);
+  //       aie::store_v(out_ptr, o3); out_ptr += 32;
+  //     }
+  //     out_ptr += (iw*8)-(iw_32_rem*32);
+  //     skip_ptr += (iw*8)-(iw_32_rem*32);
+  //   }
+
+  // } // if(iw_32_rem > 0)
+
+  event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip - vector
+// act: uint8, wts: int8, skip: uint8, out: uint8
+//
+// Assume IC >= 16 as that gives the ideal inner loop schedule
+//
+// TODO - Restricting input_width to be a multiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4, but some strange scheduling currently occurs, so for now
+// we do not.
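+//
+// Scalar sketch of the fused residual path implemented below (clamp_i8 and
+// clamp_u8 are illustrative helpers; semantics follow the scalar reference
+// above):
+//   conv = clamp_i8((acc + (1 << (scale - 1))) >> scale);
+//   out  = clamp_u8((conv + skip + (1 << (skip_scale - 1))) >> skip_scale);
+// i.e. one rounded shift-and-clamp after the convolution, then a second one
+// after the residual add.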
+//*****************************************************************************
+void conv2dk1_skip_ui8_vector(uint8_t *input0, uint8_t *input1,
+                              int8_t *kernels, uint8_t *output, uint8_t *skip,
+                              const int32_t input_width,
+                              const int32_t input_channels,
+                              const int32_t output_channels, const int scale,
+                              const int skip_scale) {
+  event0();
+
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+  ::aie::set_rounding(
+      aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  uint8_t *restrict out_ptr = output;
+  int8_t *i_out_ptr = (int8_t *)output;
+  uint8_t *restrict skip_ptr = skip;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+
+  constexpr int NUM_ACC = 8;
+
+  const int iw_32 = (input_width / 4) / 8;
+  const int iw = input_width;
+  // const int iw_32_rem = (input_width / 4) % 8;
+  assert((input_width / 4) % 8 == 0);
+  const int iw_32_rem = 0; // TODO - See restriction
+
+  assert((input_channels / 8) > 2); // Assume IC >= 16
+
+  int input_offset1 = 0;
+  int input_offset2 = 0;
+
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int x = 0; x < iw_32; x++) {
+        MMUL4x8x8 acc_tmp[NUM_ACC];
+        for (int i = 0; i < NUM_ACC; i++) {
+          acc_tmp[i] = aie::zeros<acc32, 32>();
+        }
+        for (int ic = 0; ic < (input_channels / 16); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++) {
+              aie::vector<uint8, 32> in_a =
+                  aie::load_v<32>(input0 + input_offset1);
+              input_offset1 += 32; // act oc0..3(ic0..7)
+              acc_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset1 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+        for (int ic = 0; ic < (input_channels / 16); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++) {
+              aie::vector<uint8, 32> in_a =
+                  aie::load_v<32>(input1 + input_offset2);
+              input_offset2 += 32; // act oc0..3(ic0..7)
+              acc_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset2 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+        // input ptr just moves to next section
+        for (int x8 = 0; x8 < NUM_ACC; x8++) {
+          aie::vector<uint8, 32> skip1 = aie::load_v<32>(skip_ptr);
+          skip_ptr += 32;
+
+          aie::accum<acc32, 32> accj;
+          accj.from_vector(skip1, 0);
+          accj = aie::add(accj, acc_tmp[x8].to_vector<uint8>(scaleT));
+          // accj = aie::mac(accj, acc_tmp[x8].to_vector<uint8>(scaleT),
+          // (uint8_t)1);
+          aie::vector<uint8, 32> o1 = accj.to_vector<uint8>(skip_scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+          // acc_tmp[x8] = aie::zeros<acc32, 32>();
+        }
+        input_offset1 -=
+            ((input_channels / 16) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        input_offset2 -=
+            ((input_channels / 16) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+      } // for(int x=0; x<iw_32; x++) {
+    } // for(int oc=0; oc<(output_channels/8); oc++) {
+
+    // if(iw_32_rem > 0) {
+    //   for(int x8=0; x8<iw_32_rem; x8++) {
+    //     // aie::vector<uint8, 32> skip1 = aie::load_v<32>(skip_ptr);
+    //     skip_ptr += 32; aie::vector<uint8, 32> skip1 =
+    //     aie::load_v<32>(skip_ptr); skip_ptr += 32;
+    //     // aie::vector<uint8, 32> tmp = aie::load_v<32>(out_ptr);
+    //     aie::vector<int8, 32> tmp = aie::load_v<32>(i_out_ptr);
+    //     i_out_ptr += 32; aie::accum<acc32, 32> accj;
+    //     accj.from_vector(skip1,0);
+    //     accj = aie::mac(accj, tmp, (uint8_t)1);
+    //     aie::vector<uint8, 32> o3 =
+    //     accj.to_vector<uint8>(skip_scaleT); aie::store_v(out_ptr,
+    //     o3); out_ptr += 32;
+    //   }
+    // }
+    // out_ptr += (iw_32_rem*32);
+    // skip_ptr += (iw_32_rem*32);
+    // }
+
+    out_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+    skip_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+
+  } // if(iw_32 > 0) {
+
+  // **TODO** Move out_ptr and skip_ptr back to first oc/8 rem location
+
+  // if(iw_32_rem > 0) {
+
+  //   const int ocs = output_channels;
+  //   const int ics = input_channels;
+
+  //   input_offset1 = 0; // TODO need to offset this to ic_32_rem position
+  //   input_offset2 = 0; // TODO need to offset this to ic_32_rem position
+
+  //   for(int oc=0; oc<(ocs/8); oc++) {
+  //     for(int ic=0; ic<(ics/16); ic++) {
+  //       // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+  //       (2x 256b loads)
+  //       // For ic > 8, we would load the next 64 weights that are
+  //       ic8..15(oc0..7)
+  //       // For oc > 8, we would load the next 64 weights after all the ic
+  //       weights {OC}{IC}{IC8}{OC8} aie::vector<int8, 64> in_b =
+  //       aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+  //       for(int x=0; x<iw_32_rem; x++) {
+  //         aie::vector<uint8, 32> in_a =
+  //         aie::load_v<32>(input0+input_offset1); input_offset1 += 32; //
+  //         act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+  //       }
+  //       input_offset1 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+  //       position, TODO -(iw_32_rem*8)??
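+  // Illustrative numbers for this disabled remainder path (hypothetical
+  // shape): input_width = 40 gives iw_32 = (40/4)/8 = 1 and
+  // iw_32_rem = (40/4)%8 = 2, i.e. two 4-pixel groups would be handled here.
+  // The hard-coded `448` reset below is specific to one shape and would need
+  // to be re-derived from the commented-out formula in general.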
+  //     }
+  //     for(int ic=0; ic<(ics/16); ic++) {
+  //       // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+  //       (2x 256b loads)
+  //       // For ic > 8, we would load the next 64 weights that are
+  //       ic8..15(oc0..7)
+  //       // For oc > 8, we would load the next 64 weights after all the ic
+  //       weights {OC}{IC}{IC8}{OC8} aie::vector<int8, 64> in_b =
+  //       aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+  //       for(int x=0; x<iw_32_rem; x++) {
+  //         aie::vector<uint8, 32> in_a =
+  //         aie::load_v<32>(input1+input_offset2); input_offset2 += 32; //
+  //         act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+  //       }
+  //       input_offset2 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+  //       position
+  //     }
+  //     // input ptr just moves to next section
+  //     for(int xx=0; xx<iw_32_rem; xx++) {
+  //       // aie::vector<uint8, 32> o1 = acc_tmp[xx].to_vector<uint8>(scaleT);
+  //       aie::vector<int8, 32> o1 = acc_tmp[xx].to_vector<int8>(scaleT);
+  //       // aie::store_v(out_ptr, o1); out_ptr += 32;
+  //       aie::store_v(i_out_ptr, o1); i_out_ptr += 32;
+  //       acc_tmp[xx] = aie::zeros<acc32, 32>();
+  //     }
+  //     // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning
+  //     of input ptr for remainder input_offset1 -= 448; // reset to
+  //     beginning of input ptr for remainder input_offset2 -= 448; // reset
+  //     to beginning of input ptr for remainder
+  //     // kernel ptr already at next oc/8
+  //     i_out_ptr += (iw*8)-(iw_32_rem*32); // move to next oc/8
+  //     (skip remainder section if present)
+  //   }
+
+  //   i_out_ptr -= output_channels*iw;
+
+  //   for(int oc=0; oc<(output_channels/8); oc++) {
+  //     for(int x8=0; x8<iw_32_rem; x8++) {
+  //       aie::vector<uint8, 32> skip1 = aie::load_v<32>(skip_ptr); skip_ptr +=
+  //       32; aie::vector<int8, 32> tmp = aie::load_v<32>(i_out_ptr);
+  //       aie::accum<acc32, 32> accj;
+  //       accj.from_vector(skip1,0);
+  //       accj = aie::mac(accj, tmp, (uint8_t)1);
+  //       aie::vector<uint8, 32> o3 = accj.to_vector<uint8>(skip_scaleT);
+  //       aie::store_v(out_ptr, o3); out_ptr += 32;
+  //     }
+  //     out_ptr += (iw*8)-(iw_32_rem*32);
+  //     skip_ptr += (iw*8)-(iw_32_rem*32);
+  //   }
+
+  // } // if(iw_32_rem > 0)
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+//*****************************************************************************
+// conv2d 1x1 skip wrappers
+//*****************************************************************************
+extern "C" {
+
+#ifdef SCALAR
+
+#ifdef INT8_ACT
+
+void conv2dk1_skip_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                      uint8_t *output, int8_t *skip, const int32_t input_width,
+                      const int32_t input_channels,
+                      const int32_t output_channels, const int scale,
+                      const int skip_scale) {
+  conv2dk1_skip_i8_scalar(input0, input1, kernels, output, skip, input_width,
+                          input_channels, output_channels, scale, skip_scale);
+}
+
+#else // UINT8_ACT
+
+void conv2dk1_skip_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                       uint8_t *output, uint8_t *skip,
+                       const int32_t input_width, const int32_t input_channels,
+                       const int32_t output_channels, const int scale,
+                       const int skip_scale) {
+  conv2dk1_skip_ui8_scalar(input0, input1, kernels, output, skip, input_width,
+                           input_channels, output_channels, scale, skip_scale);
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+void conv2dk1_skip_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                      uint8_t *output, int8_t *skip, const int32_t input_width,
+                      const int32_t input_channels,
+                      const int32_t output_channels, const int scale,
+                      const int skip_scale) {
+  conv2dk1_skip_i8_vector(input0, input1, kernels, output, skip, input_width,
+                          input_channels, output_channels, scale, skip_scale);
+}
+
+#else // UINT8_ACT
+
+void conv2dk1_skip_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                       uint8_t *output, uint8_t *skip,
+                       const int32_t input_width, const int32_t input_channels,
+                       const int32_t output_channels, const int scale,
+                       const int skip_scale) {
+  conv2dk1_skip_ui8_vector(input0, input1, kernels, output, skip, input_width,
+                           input_channels, output_channels, scale, skip_scale);
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+} // extern "C"
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk1_skip.h b/aie_kernels/aie2/conv2dk1_skip.h
new file mode 100755
index 0000000000..8daa62e507
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_skip.h
@@ -0,0 +1,31 @@
+//===- conv2dk1_skip.h -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CONV2DK1_SKIP_H
+#define _CONV2DK1_SKIP_H
+
+extern "C" {
+
+void conv2dk1_skip_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                      uint8_t *output, int8_t *skip, const int32_t input_width,
+                      const int32_t input_channels,
+                      const int32_t output_channels, const int scale,
+                      const int skip_scale);
+
+void conv2dk1_skip_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                       uint8_t *output, uint8_t *skip,
+                       const int32_t input_width, const int32_t input_channels,
+                       const int32_t output_channels, const int scale,
+                       const int skip_scale);
+
+} // extern "C"
+
+#endif
diff --git a/aie_kernels/aie2/conv2dk1_skip_init.cc b/aie_kernels/aie2/conv2dk1_skip_init.cc
new file mode 100755
index 0000000000..591377479f
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_skip_init.cc
@@ -0,0 +1,934 @@
+//===- conv2dk1_skip_init.cc -------------------------------------------------*-
+// C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+#ifdef SCALAR
+
+const int32_t MIN = 128;
+const int32_t MAX = 127;
+const int32_t UMAX = 255;
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip init - scalar
+// act: uint8, wts: int8, skip: int8, out: uint8
+//*****************************************************************************
+// NOTE: Assumes input_channels >= 16
+void conv2dk1_skip_init_i8_scalar(
+    uint8_t *input0, uint8_t *input1, int8_t *kernels, uint8_t *output,
+    int8_t *skip, const int32_t input_width, const int32_t input_channels,
+    const int32_t output_channels, const int32_t input_channels_skip,
+    const int scale, const int skip_scale, const int scale_skip_conv) {
+  event0();
+
+  int x, ic, ic2, ic3, oc, oc8, ic8, ic8b, ic8c;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+  const int skip_scaleT_conv = scale_skip_conv;
+  const int wts_offset = output_channels * input_channels;
+
+  // const int scaleT = 10;
+  // const int skip_scaleT = 0;
+
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (oc8 = 0; oc8 < 8; oc8++) {
+      for (x = 0; x < input_width; x++) { // col of output image
+        int sum = 0;
+        int sum_srs = 0;
+        int sum_skip_conv = 0;
+        int sum_skip_conv_srs = 0;
+        int64_t skip_sum = 0;
+        int skip_sum_srs_final = 0;
+        int skip_sum_srs_final_out = 0;
+        int skip_temp = 0;
+        for (ic = 0; ic < input_channels / 16; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            // int val = input0[ic * input_width + x];
+            int val = input0[(ic * input_width * 8) + (x * 8) + ic8];
+            // int k = kernels[oc * input_channels + ic];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+        // for (ic2 = input_channels/16; ic2 < input_channels/8; ic2++) {
+        for (ic2 = 0; ic2 < input_channels / 16; ic2++) {
+          for (ic8b = 0; ic8b < 8; ic8b++) {
+            // int val2 = input1[ic2 * input_width + x];
+            int val2 = input1[(ic2 * input_width * 8) + (x * 8) +
+                              ic8b]; // TODO ic2 should be shifted?
+            // int k2 = kernels[oc * input_channels + ic2];
+            int k2 = kernels[(oc * (input_channels / 8) * 64) +
+                             ((ic2 + (input_channels / 16)) * 64) + (ic8b * 8) +
+                             oc8];
+            sum += val2 * k2;
+          }
+        }
+        // scale for convolution
+        sum_srs = (sum + (1 << (scaleT - 1))) >> scaleT;
+        sum_srs = (sum_srs > MAX)    ? MAX
+                  : (sum_srs < -MIN) ? -MIN
+                                     : sum_srs; // clip
+        // sum_srs = (sum_srs > UMAX) ? UMAX : (sum_srs < 0) ? 0 : sum_srs;
+        // //clip
+        // ********************************************************************************************************************
+        // skip convolution
+        for (ic3 = 0; ic3 < input_channels_skip / 8; ic3++) {
+          for (ic8c = 0; ic8c < 8; ic8c++) {
+            int val3 = skip[(ic3 * input_width * 8) + (x * 8) + ic8c];
+            int k3 = kernels[(oc * (input_channels_skip / 8) * 64) +
+                             (ic3 * 64) + (ic8c * 8) + oc8 + wts_offset];
+            sum_skip_conv += val3 * k3;
+          }
+        }
+        sum_skip_conv_srs =
+            (sum_skip_conv + (1 << (skip_scaleT_conv - 1))) >> skip_scaleT_conv;
+        sum_skip_conv_srs = (sum_skip_conv_srs > MAX)    ? MAX
+                            : (sum_skip_conv_srs < -MIN) ? -MIN
+                                                         : sum_skip_conv_srs;
+        // ********************************************************************************************************************
+        // scale for residual
+        // skip_temp=skip[oc * input_width + x];
+        // skip_temp=skip[(oc*input_width*8) + (x*8) + oc8] ;
+        skip_temp = sum_skip_conv_srs;
+        skip_sum = sum_srs + skip_temp;
+        skip_sum_srs_final =
+            (skip_sum + (1 << (skip_scaleT - 1))) >> skip_scaleT;
+        skip_sum_srs_final_out = (skip_sum_srs_final > UMAX) ? UMAX
+                                 : (skip_sum_srs_final < 0)
+                                     ? 0
+                                     : skip_sum_srs_final; // clip
+
+        // output[oc * input_width + x] = skip_sum_srs_final_out;
+        output[(oc * input_width * 8) + (x * 8) + oc8] = skip_sum_srs_final_out;
+
+        // output[oc * input_width + x] = sum;
+        // output[oc * input_width + x] = sum+skip[oc * input_width + x];
+      }
+    }
+  }
+
+  // for (oc = 0; oc < output_channels; ++oc) {
+  //   for (x = 0; x < input_width; ++x) {
+  //     output[oc * input_width + x]=skip[oc * input_width + x];}
+  // }
+
+  event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip init - scalar
+// act: uint8, wts: int8, skip: uint8, out: uint8
+//
+// NOTE: TODO Currently just a copy of the i8 code. No real differences.
+//*****************************************************************************
+void conv2dk1_skip_init_ui8_scalar(
+    uint8_t *input0, uint8_t *input1, int8_t *kernels, uint8_t *output,
+    uint8_t *skip, const int32_t input_width, const int32_t input_channels,
+    const int32_t output_channels, const int32_t input_channels_skip,
+    const int scale, const int skip_scale, const int scale_skip_conv) {
+  event0();
+
+  int x, ic, ic2, ic3, oc, oc8, ic8, ic8b, ic8c;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+  const int skip_scaleT_conv = scale_skip_conv;
+  const int wts_offset = output_channels * input_channels;
+
+  // const int scaleT = 10;
+  // const int skip_scaleT = 0;
+
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (oc8 = 0; oc8 < 8; oc8++) {
+      for (x = 0; x < input_width; x++) { // col of output image
+        int sum = 0;
+        int sum_srs = 0;
+        int sum_skip_conv = 0;
+        int sum_skip_conv_srs = 0;
+        int64_t skip_sum = 0;
+        int skip_sum_srs_final = 0;
+        int skip_sum_srs_final_out = 0;
+        int skip_temp = 0;
+        for (ic = 0; ic < input_channels / 16; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            // int val = input0[ic * input_width + x];
+            int val = input0[(ic * input_width * 8) + (x * 8) + ic8];
+            // int k = kernels[oc * input_channels + ic];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+        // for (ic2 = input_channels/16; ic2 < input_channels/8; ic2++) {
+        for (ic2 = 0; ic2 < input_channels / 16; ic2++) {
+          for (ic8b = 0; ic8b < 8; ic8b++) {
+            // int val2 = input1[ic2 * input_width + x];
+            int val2 = input1[(ic2 * input_width * 8) + (x * 8) +
+                              ic8b]; // TODO ic2 should be shifted?
+            // int k2 = kernels[oc * input_channels + ic2];
+            int k2 = kernels[(oc * (input_channels / 8) * 64) +
+                             ((ic2 + (input_channels / 16)) * 64) + (ic8b * 8) +
+                             oc8];
+            sum += val2 * k2;
+          }
+        }
+        // scale for convolution
+        sum_srs = (sum + (1 << (scaleT - 1))) >> scaleT;
+        sum_srs = (sum_srs > MAX)    ? MAX
+                  : (sum_srs < -MIN) ? -MIN
+                                     : sum_srs; // clip
+        // sum_srs = (sum_srs > UMAX) ? UMAX : (sum_srs < 0) ? 0 : sum_srs;
+        // //clip
+        // ********************************************************************************************************************
+        // skip convolution
+        for (ic3 = 0; ic3 < input_channels_skip / 8; ic3++) {
+          for (ic8c = 0; ic8c < 8; ic8c++) {
+            int val3 = skip[(ic3 * input_width * 8) + (x * 8) + ic8c];
+            int k3 = kernels[(oc * (input_channels_skip / 8) * 64) +
+                             (ic3 * 64) + (ic8c * 8) + oc8 + wts_offset];
+            sum_skip_conv += val3 * k3;
+          }
+        }
+        sum_skip_conv_srs =
+            (sum_skip_conv + (1 << (skip_scaleT_conv - 1))) >> skip_scaleT_conv;
+        sum_skip_conv_srs = (sum_skip_conv_srs > MAX)    ? MAX
+                            : (sum_skip_conv_srs < -MIN) ? -MIN
+                                                         : sum_skip_conv_srs;
+        // ********************************************************************************************************************
+        // scale for residual
+        // skip_temp=skip[oc * input_width + x];
+        // skip_temp=skip[(oc*input_width*8) + (x*8) + oc8] ;
+        skip_temp = sum_skip_conv_srs;
+        skip_sum = sum_srs + skip_temp;
+        skip_sum_srs_final =
+            (skip_sum + (1 << (skip_scaleT - 1))) >> skip_scaleT;
+        skip_sum_srs_final_out = (skip_sum_srs_final > UMAX) ? UMAX
+                                 : (skip_sum_srs_final < 0)
+                                     ? 0
+                                     : skip_sum_srs_final; // clip
+
+        // output[oc * input_width + x] = skip_sum_srs_final_out;
+        output[(oc * input_width * 8) + (x * 8) + oc8] = skip_sum_srs_final_out;
+
+        // output[oc * input_width + x] = sum;
+        // output[oc * input_width + x] = sum+skip[oc * input_width + x];
+      }
+    }
+  }
+
+  // for (oc = 0; oc < output_channels; ++oc) {
+  //   for (x = 0; x < input_width; ++x) {
+  //     output[oc * input_width + x]=skip[oc * input_width + x];}
+  // }
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip init - vector
+// act: uint8, wts: int8, skip: int8, out: uint8
+//*****************************************************************************
+void conv2dk1_skip_init_i8_vector(
+    uint8_t *input0, uint8_t *input1, int8_t *kernels, uint8_t *output,
+    int8_t *skip, const int32_t input_width, const int32_t input_channels,
+    const int32_t output_channels, const int32_t input_channels_skip,
+    const int scale, const int skip_scale, const int scale_skip_conv)
+
+{
+  event0();
+
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+  using MMULi4x8x8 = aie::mmul<4, 8, 8, int8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+  ::aie::set_rounding(
+      aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  uint8_t * /*restrict*/ out_ptr = output;
+  int8_t *i_out_ptr = (int8_t *)output;
+  // uint8_t * restrict skip_ptr = skip;
+  int8_t *restrict skip_ptr = skip;
+
+  const int wts_offset = output_channels * input_channels;
+  int8_t *kernels_skip = kernels + wts_offset;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+  const int scaleT_skip_conv = scale_skip_conv;
+
+  constexpr int NUM_ACC = 8;
+
+  const int iw_32 = (input_width / 4) / 8;
+  const int iw = input_width;
+  const int iw_32_rem = (input_width / 4) % 8;
+
+  int input_offset1 = 0;
+  int input_offset2 = 0;
+  int input_offset3 = 0;
+
+  // aie::vector<uint8, 32> vec_tmp[NUM_ACC];
+
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int x = 0; x < iw_32; x++) {
+        aie::vector<int8, 32> vec_conv[NUM_ACC];
+        aie::vector<int8, 32> vec_skip[NUM_ACC];
+
+        { // conv section
+          MMUL4x8x8 acc_tmp[NUM_ACC];
+          for (int x8 = 0; x8 < NUM_ACC; x8++) {
+            acc_tmp[x8] = aie::zeros<acc32, 32>();
+          }
+
+          for (int ic = 0; ic < (input_channels / 16); ic++) { // half ic/8
+            // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+            // (2x 256b loads) For ic > 8, we would load the next 64 weights
+            // that are ic8..15(oc0..7) For oc > 8, we would load the next 64
+            // weights after all the ic weights {OC}{IC}{IC8}{OC8}
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++)
+            // chess_prepare_for_pipelining //chess_loop_range(7, )
+            // e.g. 28/4 = 7
+            // 13 cycles delay for vload.
+            // 7 gives us 3 cycle inner loop.
+            // 13 gave 1 cycle inner loop before partial load, now it only gets
+            // 2 cycles (not sure why?)
+            {
+              aie::vector<uint8, 32> in_a =
+                  aie::load_v<32>(input0 + input_offset1);
+              input_offset1 += 32; // act oc0..3(ic0..7)
+              acc_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset1 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+          for (int ic = 0; ic < (input_channels / 16); ic++) { // half ic/8
+            // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+            // (2x 256b loads) For ic > 8, we would load the next 64 weights
+            // that are ic8..15(oc0..7) For oc > 8, we would load the next 64
+            // weights after all the ic weights {OC}{IC}{IC8}{OC8}
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++)
+            // chess_prepare_for_pipelining //chess_loop_range(7, )
+            // e.g. 28/4 = 7
+            // 13 cycles delay for vload.
+            // 7 gives us 3 cycle inner loop.
+            // 13 gave 1 cycle inner loop before partial load, now it only gets
+            // 2 cycles (not sure why?)
+            {
+              aie::vector<uint8, 32> in_a =
+                  aie::load_v<32>(input1 + input_offset2);
+              input_offset2 += 32; // act oc0..3(ic0..7)
+              acc_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset2 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+          for (int x8 = 0; x8 < NUM_ACC; x8++) {
+            vec_conv[x8] = acc_tmp[x8].to_vector<int8>(scaleT);
+          }
+        } // conv section
+
+        { // skip section
+          MMULi4x8x8 acci_tmp[NUM_ACC];
+          for (int x8 = 0; x8 < NUM_ACC; x8++) {
+            acci_tmp[x8] = aie::zeros<acc32, 32>();
+          }
+
+          for (int ic = 0; ic < (input_channels_skip / 8); ic++) {
+            // For oc > 8, we would load the next 64 weights after all the ic
+            // weights {OC}{IC}{IC8}{OC8}
+            aie::vector<int8, 64> in_b = aie::load_v<64>(kernels_skip);
+            kernels_skip += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++) {
+              aie::vector<int8, 32> in_a =
+                  aie::load_v<32>(skip + input_offset3);
+              input_offset3 += 32; // act oc0..3(ic0..7)
+              acci_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset3 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+          for (int x8 = 0; x8 < NUM_ACC; x8++) {
+            vec_skip[x8] = acci_tmp[x8].to_vector<int8>(scaleT_skip_conv);
+          }
+        } // skip section
+
+        // input ptr just moves to next section
+        for (int x8 = 0; x8 < NUM_ACC; x8++) {
+          aie::accum<acc32, 32> accj;
+          accj.from_vector(vec_conv[x8], 0);
+          accj = aie::add(accj, vec_skip[x8]);
+          aie::vector<uint8, 32> o1 = accj.to_vector<uint8>(skip_scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+        }
+        input_offset1 -=
+            ((input_channels / 16) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        input_offset2 -=
+            ((input_channels / 16) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        input_offset3 -=
+            ((input_channels_skip / 8) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+        kernels_skip -= (input_channels_skip / 8) *
+                        64; // reset kernel back to beginning of ic/8
+      } // for(int x=0; x<iw_32; x++) {
+    } // for(int oc=0; oc<(output_channels/8); oc++) {
+
+    // if(iw_32_rem > 0) {
+    //   for(int x8=0; x8<iw_32_rem; x8++) {
+    //     // aie::vector<int8, 32> skip1 = aie::load_v<32>(skip_ptr);
+    //     skip_ptr += 32; aie::vector<int8, 32> skip1 =
+    //     aie::load_v<32>(skip_ptr); skip_ptr += 32;
+    //     // aie::vector<uint8, 32> tmp = aie::load_v<32>(out_ptr);
+    //     aie::vector<int8, 32> tmp = aie::load_v<32>(i_out_ptr);
+    //     i_out_ptr += 32; aie::accum<acc32, 32> accj;
+    //     accj.from_vector(skip1,0);
+    //     accj = aie::mac(accj, tmp, (uint8_t)1);
+    //     aie::vector<uint8, 32> o3 =
+    //     accj.to_vector<uint8>(skip_scaleT); aie::store_v(out_ptr,
+    //     o3); out_ptr += 32;
+    //   }
+    // }
+    // out_ptr += (iw_32_rem*32);
+    // skip_ptr += (iw_32_rem*32);
+    // }
+
+    out_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+    skip_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+
+  } // if(iw_32 > 0) {
+
+  // **TODO** Move out_ptr and skip_ptr back to first oc/8 rem location
+
+  // if(iw_32_rem > 0) {
+
+  //   const int ocs = output_channels;
+  //   const int ics = input_channels;
+
+  //   input_offset1 = 0; // TODO need to offset this to ic_32_rem position
+  //   input_offset2 = 0; // TODO need to offset this to ic_32_rem position
+
+  //   for(int oc=0; oc<(ocs/8); oc++) {
+  //     for(int ic=0; ic<(ics/16); ic++) {
+  //       // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+  //       (2x 256b loads)
+  //       // For ic > 8, we would load the next 64 weights that are
+  //       ic8..15(oc0..7)
+  //       // For oc > 8, we would load the next 64 weights after all the ic
+  //       weights {OC}{IC}{IC8}{OC8} aie::vector<int8, 64> in_b =
+  //       aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+  //       for(int x=0; x<iw_32_rem; x++) {
+  //         aie::vector<uint8, 32> in_a =
+  //         aie::load_v<32>(input0+input_offset1); input_offset1 += 32; //
+  //         act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+  //       }
+  //       input_offset1 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+  //       position, TODO -(iw_32_rem*8)??
+ // } + // for(int ic=0; ic<(ics/16); ic++) { + // // For ic = oc = 8, we can load all the weights in 1x 512b vec reg + // (2x 256b loads) + // // For ic > 8, we would load the next 64 weights that are + // ic8..15(oc0..7) + // // For oc > 8, we would load the next 64 weights after all the ic + // weights {OC}{IC}{IC8}{OC8} aie::vector in_b = + // aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7) + + // for(int x=0; x in_a = + // aie::load_v<32>(input1+input_offset2); input_offset2 += 32; // + // act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b); + // } + // input_offset2 += (iw*8)-(iw_32_rem*32); // Move to next ic/8 + // position + // } + // // input ptr just moves to next section + // for(int xx=0; xx o1 = acc_tmp[xx].to_vector(scaleT); + // aie::vector o1 = acc_tmp[xx].to_vector(scaleT); + // // aie::store_v(out_ptr, o1); out_ptr += 32; + // aie::store_v(i_out_ptr, o1); i_out_ptr += 32; + // acc_tmp[xx] = aie::zeros(); + // } + // // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning + // of input ptr for remainder input_offset1 -= 448; // reset to + // beginning of input ptr for remainder input_offset2 -= 448; // reset + // to beginning of input ptr for remainder + // // kernel ptr already at next oc/8 + // i_out_ptr += (iw*8)-(iw_32_rem*32); // move to next oc/8 + // (skip remainder section if present) + // } + + // i_out_ptr -= output_channels*iw; + + // for(int oc=0; oc<(output_channels/8); oc++) { + // for(int x8=0; x8 skip1 = aie::load_v<32>(skip_ptr); skip_ptr += + // 32; aie::vector tmp = aie::load_v<32>(i_out_ptr); + // aie::accum accj; + // accj.from_vector(skip1,0); + // accj = aie::mac(accj, tmp, (uint8_t)1); + // aie::vector o3 = accj.to_vector(skip_scaleT); + // aie::store_v(out_ptr, o3); out_ptr += 32; + // } + // out_ptr += (iw*8)-(iw_32_rem*32); + // skip_ptr += (iw*8)-(iw_32_rem*32); + // } + + // } // if(iw_32_rem > 0) + + event1(); +} + +#else // UINT8_ACT + +//***************************************************************************** +// conv2d 1x1 skip init - vector +// act: uint8, wts: int8, skip: uint8, out: uint8 +//***************************************************************************** +void conv2dk1_skip_init_ui8_vector( + uint8_t *input0, uint8_t *input1, int8_t *kernels, uint8_t *output, + uint8_t *skip, const int32_t input_width, const int32_t input_channels, + const int32_t output_channels, const int32_t input_channels_skip, + const int scale, const int skip_scale, const int scale_skip_conv) + +{ + event0(); + + using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>; + // using MMULi4x8x8 = aie::mmul<4, 8, 8, int8, int8>; + ::aie::set_saturation( + aie::saturation_mode::saturate); // Needed to saturate properly to uint8 + ::aie::set_rounding( + aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8 + + uint8_t * /*restrict*/ out_ptr = output; + int8_t *i_out_ptr = (int8_t *)output; + // uint8_t * restrict skip_ptr = skip; + uint8_t *restrict skip_ptr = skip; + + const int wts_offset = output_channels * input_channels; + int8_t *kernels_skip = kernels + wts_offset; + + const int scaleT = scale; + const int skip_scaleT = skip_scale; + const int scaleT_skip_conv = scale_skip_conv; + + constexpr int NUM_ACC = 8; + + const int iw_32 = (input_width / 4) / 8; + const int iw = input_width; + const int iw_32_rem = (input_width / 4) % 8; + + int input_offset1 = 0; + int input_offset2 = 0; + int input_offset3 = 0; + + // aie::vector vec_tmp[NUM_ACC]; + + if (iw_32 > 0) { + + for (int oc = 0; oc < (output_channels / 8); 
oc++) { + for (int x = 0; x < iw_32; x++) { + aie::vector vec_conv[NUM_ACC]; + aie::vector vec_skip[NUM_ACC]; + + MMUL4x8x8 acc_tmp[NUM_ACC]; + { // conv section + // MMUL4x8x8 acc_tmp[NUM_ACC]; + for (int x8 = 0; x8 < NUM_ACC; x8++) { + acc_tmp[x8] = aie::zeros(); + } + + for (int ic = 0; ic < (input_channels / 16); ic++) { // half ic/8 + // For ic = oc = 8, we can load all the weights in 1x 512b vec reg + // (2x 256b loads) For ic > 8, we would load the next 64 weights + // that are ic8..15(oc0..7) For oc > 8, we would load the next 64 + // weights after all the ic weights {OC}{IC}{IC8}{OC8} + aie::vector in_b = aie::load_v<64>(kernels); + kernels += 64; // wts ic0..7(oc0..7) + + for (int x8 = 0; x8 < NUM_ACC; x8++) + // chess_prepare_for_pipelining //chess_loop_range(7, ) + // e.g. 28/4 = 7 + // 13 cycles delay for vload. + // 7 gives us 3 cycle inner loop. + // 13 gave 1 cycle inner loop before partial load, not it only gets + // 2 cycles (not sure why?) + { + aie::vector in_a = + aie::load_v<32>(input0 + input_offset1); + input_offset1 += 32; // act oc0..3(ic0..7) + acc_tmp[x8].mac(in_a, in_b); + } + input_offset1 += + (iw * 8) - + 256; // Move to next ic/8 position. 256 = 32 input * 8 ic + } + for (int ic = 0; ic < (input_channels / 16); ic++) { // half ic/8 + // For ic = oc = 8, we can load all the weights in 1x 512b vec reg + // (2x 256b loads) For ic > 8, we would load the next 64 weights + // that are ic8..15(oc0..7) For oc > 8, we would load the next 64 + // weights after all the ic weights {OC}{IC}{IC8}{OC8} + aie::vector in_b = aie::load_v<64>(kernels); + kernels += 64; // wts ic0..7(oc0..7) + + for (int x8 = 0; x8 < NUM_ACC; x8++) + // chess_prepare_for_pipelining //chess_loop_range(7, ) + // e.g. 28/4 = 7 + // 13 cycles delay for vload. + // 7 gives us 3 cycle inner loop. + // 13 gave 1 cycle inner loop before partial load, not it only gets + // 2 cycles (not sure why?) + { + aie::vector in_a = + aie::load_v<32>(input1 + input_offset2); + input_offset2 += 32; // act oc0..3(ic0..7) + acc_tmp[x8].mac(in_a, in_b); + } + input_offset2 += + (iw * 8) - + 256; // Move to next ic/8 position. 256 = 32 input * 8 ic + } + for (int x8 = 0; x8 < NUM_ACC; x8++) { + vec_conv[x8] = acc_tmp[x8].to_vector(scaleT); + } + } // conv section + + { // skip section + // MMULi4x8x8 acci_tmp[NUM_ACC]; + for (int x8 = 0; x8 < NUM_ACC; x8++) { + acc_tmp[x8] = aie::zeros(); + } + + for (int ic = 0; ic < (input_channels_skip / 8); ic++) { + // For oc > 8, we would load the next 64 weights after all the ic + // weights {OC}{IC}{IC8}{OC8} + aie::vector in_b = aie::load_v<64>(kernels_skip); + kernels_skip += 64; // wts ic0..7(oc0..7) + + for (int x8 = 0; x8 < NUM_ACC; x8++) { + aie::vector in_a = + aie::load_v<32>(skip + input_offset3); + input_offset3 += 32; // act oc0..3(ic0..7) + acc_tmp[x8].mac(in_a, in_b); + } + input_offset3 += + (iw * 8) - + 256; // Move to next ic/8 position. 256 = 32 input * 8 ic + } + for (int x8 = 0; x8 < NUM_ACC; x8++) { + vec_skip[x8] = acc_tmp[x8].to_vector(scaleT_skip_conv); + } + } // skip section + + // input ptr just moves to next section + for (int x8 = 0; x8 < NUM_ACC; x8++) { + aie::accum accj; + accj.from_vector(vec_conv[x8], 0); + accj = aie::add(accj, vec_skip[x8]); + aie::vector o1 = accj.to_vector(skip_scaleT); + aie::store_v(out_ptr, o1); + out_ptr += 32; + } + input_offset1 -= + ((input_channels / 16) * iw * 8) - + 256; // reset to next input_width/32 block. 
256 = 32 input * 8 ic + input_offset2 -= + ((input_channels / 16) * iw * 8) - + 256; // reset to next input_width/32 block. 256 = 32 input * 8 ic + input_offset3 -= + ((input_channels_skip / 8) * iw * 8) - + 256; // reset to next input_width/32 block. 256 = 32 input * 8 ic + kernels -= + (input_channels / 8) * 64; // reset kernel back to beginning of ic/8 + kernels_skip -= (input_channels_skip / 8) * + 64; // reset kernel back to beginning of ic/8 + } // for(int x=0; x skip1 = aie::load_v<32>(skip_ptr); + // skip_ptr += 32; aie::vector skip1 = + // aie::load_v<32>(skip_ptr); skip_ptr += 32; + // // aie::vector tmp = aie::load_v<32>(out_ptr); + // aie::vector tmp = aie::load_v<32>(i_out_ptr); + // i_out_ptr += 32; aie::accum accj; + // accj.from_vector(skip1,0); + // accj = aie::mac(accj, tmp, (uint8_t)1); + // aie::vector o3 = + // accj.to_vector(skip_scaleT); aie::store_v(out_ptr, + // o3); out_ptr += 32; + // } + // } + // out_ptr += (iw_32_rem*32); + // skip_ptr += (iw_32_rem*32); + // } + + out_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32); + skip_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32); + + } // if(iw_32 > 0) { + + // **TODO** Move out_ptr and skip_ptr back to first oc/8 rem location + + // if(iw_32_rem > 0) { + + // const int ocs = output_channels; + // const int ics = input_channels; + + // input_offset1 = 0; // TODO need to offset this to ic_32_rem position + // input_offset2 = 0; // TODO need to offset this to ic_32_rem position + + // for(int oc=0; oc<(ocs/8); oc++) { + // for(int ic=0; ic<(ics/16); ic++) { + // // For ic = oc = 8, we can load all the weights in 1x 512b vec reg + // (2x 256b loads) + // // For ic > 8, we would load the next 64 weights that are + // ic8..15(oc0..7) + // // For oc > 8, we would load the next 64 weights after all the ic + // weights {OC}{IC}{IC8}{OC8} aie::vector in_b = + // aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7) + + // for(int x=0; x in_a = + // aie::load_v<32>(input0+input_offset1); input_offset1 += 32; // + // act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b); + // } + // input_offset1 += (iw*8)-(iw_32_rem*32); // Move to next ic/8 + // position, TODO -(iw_32_rem*8)?? 
+ // } + // for(int ic=0; ic<(ics/16); ic++) { + // // For ic = oc = 8, we can load all the weights in 1x 512b vec reg + // (2x 256b loads) + // // For ic > 8, we would load the next 64 weights that are + // ic8..15(oc0..7) + // // For oc > 8, we would load the next 64 weights after all the ic + // weights {OC}{IC}{IC8}{OC8} aie::vector in_b = + // aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7) + + // for(int x=0; x in_a = + // aie::load_v<32>(input1+input_offset2); input_offset2 += 32; // + // act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b); + // } + // input_offset2 += (iw*8)-(iw_32_rem*32); // Move to next ic/8 + // position + // } + // // input ptr just moves to next section + // for(int xx=0; xx o1 = acc_tmp[xx].to_vector(scaleT); + // aie::vector o1 = acc_tmp[xx].to_vector(scaleT); + // // aie::store_v(out_ptr, o1); out_ptr += 32; + // aie::store_v(i_out_ptr, o1); i_out_ptr += 32; + // acc_tmp[xx] = aie::zeros(); + // } + // // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning + // of input ptr for remainder input_offset1 -= 448; // reset to + // beginning of input ptr for remainder input_offset2 -= 448; // reset + // to beginning of input ptr for remainder + // // kernel ptr already at next oc/8 + // i_out_ptr += (iw*8)-(iw_32_rem*32); // move to next oc/8 + // (skip remainder section if present) + // } + + // i_out_ptr -= output_channels*iw; + + // for(int oc=0; oc<(output_channels/8); oc++) { + // for(int x8=0; x8 skip1 = aie::load_v<32>(skip_ptr); skip_ptr += + // 32; aie::vector tmp = aie::load_v<32>(i_out_ptr); + // aie::accum accj; + // accj.from_vector(skip1,0); + // accj = aie::mac(accj, tmp, (uint8_t)1); + // aie::vector o3 = accj.to_vector(skip_scaleT); + // aie::store_v(out_ptr, o3); out_ptr += 32; + // } + // out_ptr += (iw*8)-(iw_32_rem*32); + // skip_ptr += (iw*8)-(iw_32_rem*32); + // } + + // } // if(iw_32_rem > 0) + + event1(); +} + +#endif // UINT8_ACT + +#endif // Vector + +//***************************************************************************** +// conv2d 1x1 skip init wrappers +//***************************************************************************** +extern "C" { + +#ifdef SCALAR + +#ifdef INT8_ACT + +void conv2dk1_skip_init_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels, + uint8_t *output, int8_t *skip, + const int32_t input_width, + const int32_t input_channels, + const int32_t output_channels, + const int32_t input_channels_skip, const int scale, + const int skip_scale, const int scale_skip_conv) { + conv2dk1_skip_init_i8_scalar( + input0, input1, kernels, output, skip, input_width, input_channels, + output_channels, input_channels_skip, scale, skip_scale, scale_skip_conv); +} + +#else // UINT8_ACT + +void conv2dk1_skip_init_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels, + uint8_t *output, uint8_t *skip, + const int32_t input_width, + const int32_t input_channels, + const int32_t output_channels, + const int32_t input_channels_skip, const int scale, + const int skip_scale, const int scale_skip_conv) { + // conv2dk1_skip_init_ui8_scalar(input0, input1, kernels, output, skip, + // input_width, input_channels, output_channels, input_channels_skip, scale, + // skip_scale, scale_skip_conv); +} + +#endif // UINT8_ACT + +#else // Vector + +#ifdef INT8_ACT + +void conv2dk1_skip_init_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels, + uint8_t *output, int8_t *skip, + const int32_t input_width, + const int32_t input_channels, + const int32_t output_channels, + const int32_t input_channels_skip, const int scale, 
+ const int skip_scale, const int scale_skip_conv) { + conv2dk1_skip_init_i8_vector( + input0, input1, kernels, output, skip, input_width, input_channels, + output_channels, input_channels_skip, scale, skip_scale, scale_skip_conv); +} + +#else // UINT8_ACT + +void conv2dk1_skip_init_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels, + uint8_t *output, uint8_t *skip, + const int32_t input_width, + const int32_t input_channels, + const int32_t output_channels, + const int32_t input_channels_skip, const int scale, + const int skip_scale, const int scale_skip_conv) { + // conv2dk1_skip_init_ui8_vector(input0, input1, kernels, output, skip, + // input_width, input_channels, output_channels, input_channels_skip, scale, + // skip_scale, scale_skip_conv); +} + +#endif // UINT8_ACT + +#endif // Vector + +} // extern "C" \ No newline at end of file diff --git a/aie_kernels/aie2/conv2dk1_skip_init.h b/aie_kernels/aie2/conv2dk1_skip_init.h new file mode 100755 index 0000000000..cfb4b8b467 --- /dev/null +++ b/aie_kernels/aie2/conv2dk1_skip_init.h @@ -0,0 +1,33 @@ +//===- conv2dk1_skip_init.h -------------------------------------------------*- +// C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#ifndef _CONV2DK1_SKIP_INIT_H +#define _CONV2DK1_SKIP_INIT_H + +extern "C" { + +void conv2dk1_skip_init_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels, + uint8_t *output, int8_t *skip, + const int32_t input_width, + const int32_t input_channels, + const int32_t output_channels, const int scale, + const int skip_scale); + +void conv2dk1_skip_init_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels, + uint8_t *output, uint8_t *skip, + const int32_t input_width, + const int32_t input_channels, + const int32_t output_channels, const int scale, + const int skip_scale); + +} // extern "C" + +#endif diff --git a/aie_kernels/aie2/conv2dk3.cc b/aie_kernels/aie2/conv2dk3.cc new file mode 100755 index 0000000000..e0f3d9e1b5 --- /dev/null +++ b/aie_kernels/aie2/conv2dk3.cc @@ -0,0 +1,1434 @@ +//===- conv2dk3.cc -------------------------------------------------*- C++ +//-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <aie_api/aie.hpp>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+enum region { top, middle, bottom };
+
+#ifdef SCALAR
+
+const int32_t MAX = 255;
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 3x3 - scalar
+// act: int8, wts: int8, out: uint8
+//*****************************************************************************
+void conv2dk3_i8_scalar(int8_t *line0, int8_t *line1, int8_t *line2,
+                        int8_t *wts, uint8_t *output, const int32_t input_width,
+                        const int32_t input_channels,
+                        const int32_t output_channels,
+                        const int32_t kernel_width, const int32_t kernel_height,
+                        const int32_t check, const int scale,
+                        const int channel_offset) {
+  event0();
+
+  int x, ki, ic, oc, ic8, oc8;
+  int32_t sum;
+  int sum_srs;
+  int wts_indx_0 = 0, wts_indx_1 = 0, wts_indx_2 = 0;
+  int in_indx_0 = 0;
+  // for (oc = (0+channel_offset)/8; oc < (output_channels+channel_offset)/8;
+  // oc++) {
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    int oc_ofst = oc + (channel_offset / 8);
+    for (oc8 = 0; oc8 < 8; oc8++) {
+
+      // left border
+      sum = 0;
+      sum_srs = 0;
+      for (ic = 0; ic < input_channels / 8; ic++) {
+        for (ic8 = 0; ic8 < 8; ic8++) {
+          for (ki = 1; ki < kernel_width; ki++) {
+
+            // replicate 1 border pixel on the left
+            // wts_indx_0=0*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc; wts_indx_1=1*3 + ki +
+            // 3*kernel_width*ic + 3*kernel_width*input_channels*oc;
+            // wts_indx_2=2*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc;
+            int wts_indx_0 =
+                (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_1 =
+                (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_2 =
+                (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+
+            if (ki == 0) {
+              // in_indx_0=0+ki+input_width*ic;
+              in_indx_0 = (0 + ki) * 8 + ((ic * input_width * 8) + ic8);
+            } else {
+              // in_indx_0=0+ki-1+input_width*ic;
+              in_indx_0 = (0 + ki - 1) * 8 + ((ic * input_width * 8) + ic8);
+            }
+
+            if (check != top)
+              sum += line0[in_indx_0] * wts[wts_indx_0];
+            sum += line1[in_indx_0] * wts[wts_indx_1];
+            if (check != bottom)
+              sum += line2[in_indx_0] * wts[wts_indx_2];
+          }
+        }
+      }
+      // output[oc * (input_width) + 0] = sum;
+      sum_srs = (sum + (1 << (scale - 1))) >> scale;
+      sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ?
0 : sum_srs; + // output[oc * input_width + 0] = sum_srs; + output[(oc * input_width * 8) + oc8] = sum_srs; + + // right border + sum = 0; + sum_srs = 0; + for (ic = 0; ic < input_channels / 8; ic++) { + for (ic8 = 0; ic8 < 8; ic8++) { + for (ki = 0; ki < kernel_width - 1; ki++) { + // wts_indx_0=0*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; wts_indx_1=1*3 + ki + + // 3*kernel_width*ic + 3*kernel_width*input_channels*oc; + // wts_indx_2=2*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; + int wts_indx_0 = + (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8; + int wts_indx_1 = + (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8; + int wts_indx_2 = + (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8; + + if (ki != 2) { + // in_indx_0=input_width-2+ki+input_width*ic; + in_indx_0 = + (input_width - 2 + ki) * 8 + ((ic * input_width * 8) + ic8); + } else { // replicate 1 border pixel on the right + // in_indx_0=input_width-2+ki-1+input_width*ic; + in_indx_0 = (input_width - 2 + ki - 1) * 8 + + ((ic * input_width * 8) + ic8); + } + if (check != top) + sum += line0[in_indx_0] * wts[wts_indx_0]; + sum += line1[in_indx_0] * wts[wts_indx_1]; + if (check != bottom) + sum += line2[in_indx_0] * wts[wts_indx_2]; + } + } + } + sum_srs = (sum + (1 << (scale - 1))) >> scale; + sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs; + // output[oc * input_width + input_width-1] = sum_srs; + output[(oc * input_width * 8) + (input_width - 1) * 8 + oc8] = sum_srs; + // output[oc * (input_width) + input_width-1] = sum; + + for (x = 1; x < input_width - 1; x++) { // col of output image + sum = 0; + sum_srs = 0; + for (ic = 0; ic < input_channels / 8; ic++) { + for (ic8 = 0; ic8 < 8; ic8++) { + for (ki = 0; ki < kernel_width; ki++) { + // wts format - orig is oc,ic,ky,kx, reformat is + // oc,ic,k0..k8,ic8,oc8 + + // int wts_indx_0=0*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; int wts_indx_1=1*3 + ki + + // 3*kernel_width*ic + 3*kernel_width*input_channels*oc; int + // wts_indx_2=2*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; + int wts_indx_0 = + (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + + oc8; + int wts_indx_1 = + (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + + oc8; + int wts_indx_2 = + (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + + oc8; + + // int in_indx_0=x-1+ki+input_width*ic; + int in_indx_0 = (x - 1 + ki) * 8 + ((ic * input_width * 8) + ic8); + + if (check != top) + sum += line0[in_indx_0] * wts[wts_indx_0]; + sum += line1[in_indx_0] * wts[wts_indx_1]; + if (check != bottom) + sum += line2[in_indx_0] * wts[wts_indx_2]; + } + } + } + sum_srs = (sum + (1 << (scale - 1))) >> scale; + sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 
0 : sum_srs; + output[(oc * input_width * 8) + x * 8 + oc8] = sum_srs; + // output[oc * (input_width) + x] = sum; + } + } + } + + event1(); +} + +#else // UINT8_ACT + +//***************************************************************************** +// conv2d 3x3 - scalar +// act: uint8, wts: int8, out: uint8 +//***************************************************************************** +void conv2dk3_ui8_scalar(uint8_t *line0, uint8_t *line1, uint8_t *line2, + int8_t *wts, uint8_t *output, + const int32_t input_width, + const int32_t input_channels, + const int32_t output_channels, + const int32_t kernel_width, + const int32_t kernel_height, const int32_t check, + const int scale, const int channel_offset) { + event0(); + + int x, ki, ic, oc, ic8, oc8; + int32_t sum; + int sum_srs; + int wts_indx_0 = 0, wts_indx_1 = 0, wts_indx_2 = 0; + int in_indx_0 = 0; + // for (oc = (0+channel_offset)/8; oc < (output_channels+channel_offset)/8; + // oc++) { + for (oc = 0; oc < output_channels / 8; oc++) { + int oc_ofst = oc + (channel_offset / 8); + for (oc8 = 0; oc8 < 8; oc8++) { + + // left border + sum = 0; + sum_srs = 0; + for (ic = 0; ic < input_channels / 8; ic++) { + for (ic8 = 0; ic8 < 8; ic8++) { + for (ki = 1; ki < kernel_width; ki++) { + + // replicate 1 border pixel on the left + // wts_indx_0=0*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; wts_indx_1=1*3 + ki + + // 3*kernel_width*ic + 3*kernel_width*input_channels*oc; + // wts_indx_2=2*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; + int wts_indx_0 = + (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8; + int wts_indx_1 = + (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8; + int wts_indx_2 = + (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8; + + if (ki == 0) { + // in_indx_0=0+ki+input_width*ic; + in_indx_0 = (0 + ki) * 8 + ((ic * input_width * 8) + ic8); + } else { + // in_indx_0=0+ki-1+input_width*ic; + in_indx_0 = (0 + ki - 1) * 8 + ((ic * input_width * 8) + ic8); + } + + if (check != top) + sum += line0[in_indx_0] * wts[wts_indx_0]; + sum += line1[in_indx_0] * wts[wts_indx_1]; + if (check != bottom) + sum += line2[in_indx_0] * wts[wts_indx_2]; + } + } + } + // output[oc * (input_width) + 0] = sum; + sum_srs = (sum + (1 << (scale - 1))) >> scale; + sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 
0 : sum_srs; + // output[oc * input_width + 0] = sum_srs; + output[(oc * input_width * 8) + oc8] = sum_srs; + + // right border + sum = 0; + sum_srs = 0; + for (ic = 0; ic < input_channels / 8; ic++) { + for (ic8 = 0; ic8 < 8; ic8++) { + for (ki = 0; ki < kernel_width - 1; ki++) { + // wts_indx_0=0*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; wts_indx_1=1*3 + ki + + // 3*kernel_width*ic + 3*kernel_width*input_channels*oc; + // wts_indx_2=2*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; + int wts_indx_0 = + (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8; + int wts_indx_1 = + (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8; + int wts_indx_2 = + (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8; + + if (ki != 2) { + // in_indx_0=input_width-2+ki+input_width*ic; + in_indx_0 = + (input_width - 2 + ki) * 8 + ((ic * input_width * 8) + ic8); + } else { // replicate 1 border pixel on the right + // in_indx_0=input_width-2+ki-1+input_width*ic; + in_indx_0 = (input_width - 2 + ki - 1) * 8 + + ((ic * input_width * 8) + ic8); + } + if (check != top) + sum += line0[in_indx_0] * wts[wts_indx_0]; + sum += line1[in_indx_0] * wts[wts_indx_1]; + if (check != bottom) + sum += line2[in_indx_0] * wts[wts_indx_2]; + } + } + } + sum_srs = (sum + (1 << (scale - 1))) >> scale; + sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs; + // output[oc * input_width + input_width-1] = sum_srs; + output[(oc * input_width * 8) + (input_width - 1) * 8 + oc8] = sum_srs; + // output[oc * (input_width) + input_width-1] = sum; + + for (x = 1; x < input_width - 1; x++) { // col of output image + sum = 0; + sum_srs = 0; + for (ic = 0; ic < input_channels / 8; ic++) { + for (ic8 = 0; ic8 < 8; ic8++) { + for (ki = 0; ki < kernel_width; ki++) { + // wts format - orig is oc,ic,ky,kx, reformat is + // oc,ic,k0..k8,ic8,oc8 + + // int wts_indx_0=0*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; int wts_indx_1=1*3 + ki + + // 3*kernel_width*ic + 3*kernel_width*input_channels*oc; int + // wts_indx_2=2*3 + ki + 3*kernel_width*ic + + // 3*kernel_width*input_channels*oc; + int wts_indx_0 = + (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + + oc8; + int wts_indx_1 = + (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + + oc8; + int wts_indx_2 = + (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) + + (ic8 * 8) + + (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + + oc8; + + // int in_indx_0=x-1+ki+input_width*ic; + int in_indx_0 = (x - 1 + ki) * 8 + ((ic * input_width * 8) + ic8); + + if (check != top) + sum += line0[in_indx_0] * wts[wts_indx_0]; + sum += line1[in_indx_0] * wts[wts_indx_1]; + if (check != bottom) + sum += line2[in_indx_0] * wts[wts_indx_2]; + } + } + } + sum_srs = (sum + (1 << (scale - 1))) >> scale; + sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 
0 : sum_srs;
+        output[(oc * input_width * 8) + x * 8 + oc8] = sum_srs;
+        // output[oc * (input_width) + x] = sum;
+      }
+    }
+  }
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 3x3 - vector
+// act: int8, wts: int8, out: uint8
+//*****************************************************************************
+void conv2dk3_i8_vector(int8_t *line0, int8_t *line1, int8_t *line2,
+                        int8_t *wts, uint8_t *output, const int32_t input_width,
+                        const int32_t input_channels,
+                        const int32_t output_channels,
+                        const int32_t kernel_width, const int32_t kernel_height,
+                        const int32_t check, const int scale,
+                        const int channel_offset) {
+  event0();
+
+  // Compute
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, int8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+  ::aie::set_rounding(
+      aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  constexpr unsigned VecFactor = 16;
+
+  // const int scale = 11;
+
+  // basic MMUL intrinsic needed is k x ic x oc
+  // k is number of inputs processed at a time
+  // So if ic=8, oc=4, then k=8 and we use 8x8x4
+  const unsigned k =
+      256 / (input_channels * output_channels); // 8 inputs per vector output
+
+  aie::vector<int8, 32> zero32 = aie::zeros<int8, 32>();
+
+  // aie::vector<int8, 64> prev_a[3],
+  // aie::vector<int8, 64> in_a;
+  // aie::vector<int8, 64> in_b;
+  // aie::vector<int8, 64> tmp_a;
+  // aie::vector<int8, 32> tmp_a1, tmp_a2;
+
+  // int8_t * restrict line[3];
+  int8_t *line[3];
+  line[0] = line0;
+  line[1] = line1;
+  line[2] = line2;
+
+  // int8_t * restrict wtsLine[3];
+  int8_t *wtsLine[3];
+  // oc,ic,ky,kx,ic8,oc8
+  wtsLine[0] = wts + (channel_offset / 8) * (input_channels / 8) *
+                         kernel_height * kernel_width * 64;
+  wtsLine[1] = wts +
+               (channel_offset / 8) * (input_channels / 8) * kernel_height *
+                   kernel_width * 64 +
+               kernel_width * 64; // next kernel line is always 8*8 away
+  wtsLine[2] = wts +
+               (channel_offset / 8) * (input_channels / 8) * kernel_height *
+                   kernel_width * 64 +
+               2 * kernel_width * 64; // next kernel line is always 8*8 away
+
+  MMUL4x8x8 acc_tmp[8];
+
+  // Zero accumulators used for storing partial results
+  // for(int x=0; x<iw; x++) {
+  for (int x = 0; x < 8; x++) {
+    acc_tmp[x] = aie::zeros<acc32, 32>();
+  }
+
+  // TODO temporary workaround. When assigned to input_width, the results are
+  // wrong. ???
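+  // Sanity check (a sketch, assuming iw stays hard-coded at 32): from the
+  // commented-out formulas below,
+  //   iw_32     = ((32 / 4) - 2) / 8 = 6 / 8 = 0
+  //   iw_32_rem = ((32 / 4) - 2) % 8 = 6
+  // i.e. the 8-block middle loop is skipped and the 6-block remainder plus
+  // the left/right border patterns cover the whole 32-pixel row.
+  static_assert(((32 / 4) - 2) / 8 == 0 && ((32 / 4) - 2) % 8 == 6,
+                "iw == 32 implies iw_32 == 0 and iw_32_rem == 6");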
+ const int iw = 32; + // const int32_t iw = input_width; + + // const int iw_32 = ((input_width/4)-2)/8; + // const int iw_32 = ((iw/4)-2)/8; + // const int iw_32 = ((32/4)-2)/8; + const int iw_32 = 0; + + // const int iw_32_rem = ((input_width/4)-2) % 8; + // const int iw_32_rem = ((iw/4)-2) % 8; + // const int iw_32_rem = ((32/4)-2) % 8; + const int iw_32_rem = 6; + + // output += (channel_offset*iw); // channel_offset/8*iw*8 + + int kernel_height_start; + int kernel_height_end; + + // int kernel_height_start, kernel_height_end; +#ifdef BORDER_REPLICATE + kernel_height_start = 0; + kernel_height_end = kernel_height; + // constexpr int kernel_height_start = 0; + // constexpr int kernel_height_end = kernel_height; +#else // Zero border for 3x3 + // constexpr int kernel_height_start = 0; + // constexpr int kernel_height_end = kernel_height-1; + + // if(check == top) + // idx_adj = 1; + + // We skip top or bottom row for zero border + switch (check) { + case top: + kernel_height_start = 1; + kernel_height_end = kernel_height; + break; + case middle: + kernel_height_start = 0; + kernel_height_end = kernel_height; + break; + case bottom: + kernel_height_start = 0; + kernel_height_end = kernel_height - 1; + break; + } +#endif + + // -------------------------------------------------------------------- + // Leftmost pattern + // -------------------------------------------------------------------- + // Computes leftmost 4 inputs for all input/output channels. + // This shifts the leftmost input data by 1 (x8 channels) for 3x3 to + // account for border. Border replicate copies the leftmost input while + // 0 border shifts in 0's. If we need to support larger than 3x3, the + // replicate logic would need to be changed. + // -------------------------------------------------------------------- + { + // in_b = aie::load_v<64>(wtsLine[kernel_height_start]); + // wtsLine[kernel_height_start] +=64; // wts ic0..7(oc0..7) + + MMUL4x8x8 acc1 = aie::zeros(); + + for (int oc = 0; oc < (output_channels / 8); oc++) { + for (int ic = 0; ic < (input_channels / 8); ic++) { + for (int i = kernel_height_start; i < kernel_height_end; i++) + chess_prepare_for_pipelining chess_loop_range(2, ) + // chess_unroll_loop() + { + // aie::vector tmp_a1, tmp_a2; + // Load input data [a0 a1 a2 a3 a4 a5 a6 a7] where each position has + // data for 8 channels + auto tmp_a1 = aie::load_v<32>(line[i]); + line[i] += 32; // act 0..3 (ic0..7 for each) + auto tmp_a2 = + aie::load_v<32>(line[i]); // act 4..7 (ic0..7 for each) + auto in_a = aie::concat(tmp_a1, tmp_a2); + +#ifdef BORDER_REPLICATE + tmp_a1 = aie::shuffle_up(tmp_a1, 24); + tmp_a.insert<32>(1, tmp_a1); +#else + tmp_a = aie::zeros(); +#endif + // Shift right 1 input (8 channels) [- a0 a1 a2 a3 a4 a5 a6] where - + // is either a0 or 0's + in_a = aie::shuffle_up_fill(in_a, tmp_a, 8); + + // Previous buffer stores shifted data, [- - - - a0 a1 a2 a3] + // where - is + // prev_a[i] = aie::shuffle_up(in_a, 24); // Shift right (4-1)*8 + + // prev_a[i] = in_a; + // prev_a[i] = aie::shuffle_up(prev_a[i], 24); // Shift right + // (4-1)*8 + + // For kernel width, we load 64 weights (8 ics x 8 ocs) and multiply + // it with the act buffer. acc[32] += in_a[32] * wts[64] We then + // shift the buffer left by 1 data position (8 channels). 
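+            // Sketch (comment only) of what each MMUL4x8x8 mac below
+            // computes, viewing A as 4 pixels x 8 input channels and B as
+            // 8 input channels x 8 output channels:
+            //   for (int m = 0; m < 4; m++)        // output pixel
+            //     for (int n = 0; n < 8; n++)      // output channel
+            //       for (int kk = 0; kk < 8; kk++) // input channel
+            //         acc[m][n] += (int)a[m][kk] * (int)b[kk][n];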
+          for (int j = 0; j < kernel_width; j++)
+          // chess_unroll_loop()
+          {
+            auto in_b = aie::load_v<64>(wtsLine[i]);
+            wtsLine[i] += 64; // wts ic0..7(oc0..7)
+            acc1.mac(in_a.extract<32>(0), in_b);
+            // Shift input A by 1 row (1x8) which is by 1 (the 8 is the ic=8)
+            in_a = aie::shuffle_down(in_a, 8);
+          }
+          wtsLine[i] -=
+              (kernel_width * 64); // Reset weight pointer for this line
+          // wtsLine[i] += ((kernel_height-1)*kernel_width*64); // Move to
+          // next ic/8 position. No need to load next set of weights because
+          // next row of weights immediately follows. line[i] += (iw*4)*8;
+          // Increment to next ic/8 position (reset at end of outermost loop)
+        } // for(int i=kernel_height_start; i<kernel_height_end; i++)
+      }   // for(int ic=0; ic<(input_channels/8); ic++)
+
+      aie::vector<uint8, 32> o1 = acc1.to_vector<uint8>(scale);
+      aie::store_v(output, o1);
+      output += iw * 8; // Shift to next oc/8 offset for left side
+
+      acc1 = aie::zeros<acc32, 32>();
+
+      // Shift back to beginning of input
+      for (int i = kernel_height_start; i < kernel_height_end; i++) {
+        line[i] -= (input_channels / 8) * (iw * 8);
+      }
+
+    } // for(int oc=0; oc<(output_channels/8); oc++) {
+
+    // Reset output to beginning, then add 4*8
+    // Reset wts to beginning of wts
+    // Reset line to beginning of input, then add 4*8
+    output -= (output_channels / 8) * (iw * 8) - 32;
+    for (int i = kernel_height_start; i < kernel_height_end; i++) {
+      wtsLine[i] -= (output_channels / 8) * (input_channels / 8) *
+                    kernel_width * kernel_height *
+                    64; // kernel_width*kernel_height*8*8
+      // line[i] -= (output_channels/8)*(input_channels/8)*(iw*8)-32; //
+      line[i] += 32;
+    }
+  }
+
+  // --------------------------------------------------------------------
+  // Middle pattern
+  // --------------------------------------------------------------------
+  // The middle section algorithm is different because we want to minimize
+  // the reloading of weights and activations. So instead, we use up to 8
+  // accumulators to store partial products with activations being shifted.
+  // Then for the next kernel position, we reload weights.
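+  // In outline (assuming 3x3 kernels and the H,W,C8 layout), each 8x4-pixel
+  // block of the middle section is computed as:
+  //   for each ic/8 group:
+  //     for each kernel row i and tap j:   // weights reloaded per tap
+  //       for (x = 0; x < 8; x++)
+  //         acc_tmp[x].mac(act(x, j), wts(i, j)); // acts shifted, not reloaded
+  //   then each acc_tmp[x] is srs'd to uint8 and stored.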
+ // + // H,W,C8 + // -------------------------------------------------------------------- + + // Main loop for when input_width/4-2 > 8 + if (iw_32 > 0) { + + for (int oc = 0; oc < (output_channels / 8); oc++) { + for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) { + for (int ic = 0; ic < (input_channels / 8); ic++) { + for (int i = kernel_height_start; i < kernel_height_end; + i++) { // 1 to 3 + + for (int j = 0; j < kernel_width; j++) { + aie::vector wtsVec = aie::load_v<64>(wtsLine[i]); + wtsLine[i] += 64; + + // auto prev = prev_a[i].extract<32>(1); // prev + // = x0..x3(ci0..ci7) + auto prev = aie::load_v<32>((line[i] - 32)); + auto curr = aie::load_v<32>((line[i])); + line[i] += 32; + auto next = aie::load_v<32>((line[i])); + line[i] += 32; + + for (int x = 0; x < 8; x++) + // chess_unroll_loop() + { + auto tmp1 = aie::concat(curr, next); + auto tprev = aie::concat(zero32, prev); + auto tmp2 = aie::shuffle_up_fill( + tmp1, tprev, 8); // curr = x3..x6(ci0..ci7) + auto tmp3 = aie::shuffle_down( + tmp2, j * 8); // curr = x4..x7(ci0..ci7) to + // x5..x8(ci0..ci7)ss + + prev = curr; + curr = next; + next = aie::load_v<32>(line[i]); + line[i] += 32; // next_prev = x4..x7(ci0..ci7) + + acc_tmp[x].mac(tmp3.extract<32>(0), wtsVec); + } // for(int x=0; x<8; x++) + line[i] -= 320; // (8+2)*32, Reset line buffer ptr to beginning of + // line (after first 4) + } // for(int j=0; j o1 = acc_tmp[x].to_vector(scale); + aie::store_v(output, o1); + output += 32; + acc_tmp[x] = aie::zeros(); + } + // For next 8 activations, reset line buffer and weights + for (int i = kernel_height_start; i < kernel_height_end; i++) { + line[i] -= + (input_channels / 8) * (iw * 8); // length of act to shift back + } + } // for(int iw_32c=0; iw_32c 0) + + // Secondary loop for input_width remainder (iw_32_rem < 8) + if (iw_32_rem > 0) { + + for (int oc = 0; oc < (output_channels / 8); oc++) { + for (int ic = 0; ic < (input_channels / 8); ic++) { + for (int i = kernel_height_start; i < kernel_height_end; + i++) { // 1 to 3 + for (int j = 0; j < kernel_width; j++) { + // New weight every kernel_width + aie::vector wtsVec = aie::load_v<64>(wtsLine[i]); + wtsLine[i] += 64; + // auto prev = prev_a[i].extract<32>(1); // prev = + // x0..x3(ci0..ci7) + auto prev = aie::load_v<32>((line[i] - 32)); + auto curr = aie::load_v<32>((line[i])); + line[i] += 32; + auto next = aie::load_v<32>((line[i])); + line[i] += 32; + + for (int x = 0; x < iw_32_rem; x++) // remainder input width < 8 + // chess_unroll_loop() + { + auto tmp1 = aie::concat(curr, next); + auto tprev = aie::concat(zero32, prev); + auto tmp2 = aie::shuffle_up_fill( + tmp1, tprev, 8); // curr = x3..x6(ci0..ci7) + auto tmp3 = aie::shuffle_down( + tmp2, + j * 8); // curr = x3..x6(ci0..ci7) to x5..x8(ci0..ci7)ss + + prev = curr; + curr = next; + next = aie::load_v<32>(line[i]); + line[i] += 32; // next_prev = x4..x7(ci0..ci7) + + acc_tmp[x].mac(tmp3.extract<32>(0), wtsVec); + } + line[i] -= + (iw_32_rem + 2) * 32; // Reset line buffer ptr to beginning of + // line (after first 4) + } // for(int j=0; j o1 = acc_tmp[x].to_vector(scale); + aie::store_v(output, o1); + output += 32; + acc_tmp[x] = aie::zeros(); // Reset accumulators + } + // Reset line ptr to beginning of input + for (int i = kernel_height_start; i < kernel_height_end; i++) { + line[i] -= (input_channels / 8) * (iw * 8); + } + // Output ptr should be in the right place (next oc/8) + output += (iw * 8) - (iw_32_rem * 32); // 32 = 4*8, shift to next oc/8 + } // for(int oc=0; oc<(output_channels/8); oc++) + // Reset 
weights and line buffers for right side + for (int i = kernel_height_start; i < kernel_height_end; i++) { + wtsLine[i] -= (output_channels / 8) * (input_channels / 8) * + kernel_width * kernel_height * + 64; // kernel_width*kernel_height*8*8 + line[i] += + iw_32_rem * 32; // shift to beginnign of right data, iw_32_rem*4*8 + } + // shift back so we're aligned with beginning of first oc/8 (rightmost 4 + // data) + output -= (output_channels / 8) * (iw * 8) - (iw_32_rem * 32); + + } // if (iw_32_rem > 0) { + + // -------------------------------------------------------------------- + // Right patterns + // -------------------------------------------------------------------- + // + // -------------------------------------------------------------------- + { + MMUL4x8x8 acc1 = aie::zeros(); + for (int oc = 0; oc < (output_channels / 8); oc++) { + for (int ic = 0; ic < (input_channels / 8); ic++) { + for (int i = kernel_height_start; i < kernel_height_end; i++) + chess_prepare_for_pipelining chess_loop_range(2, ) + // chess_unroll_loop() + { + // Load next set of data for input A (matrix row), need stride info + // or line1/2/3 pointer + // TODO, did not store previous so need to load it again + // in_a = aie::load_v<64>(line[i]-32); + auto tmp_a1 = + aie::load_v<32>(line[i] - 32); // act 24..27 (ic0..7 for each) + auto tmp_a2 = + aie::load_v<32>(line[i]); // act 28..31 (ic0..7 for each) + auto in_a = aie::concat(tmp_a1, tmp_a2); +#ifdef BORDER_REPLICATE + tmp_a2 = aie::shuffle_down(tmp_a2, 24); + tmp_a.insert<32>(0, tmp_a2); +#else + auto tmp_a = aie::zeros(); +#endif + // shift by 32-8 (fill 32 then shift up by 8) + in_a = aie::shuffle_down_fill(in_a, tmp_a, 24); // act 27..31 - - - + + for (int j = 0; j < kernel_width; j++) + // chess_unroll_loop() + { + auto in_b = aie::load_v<64>(wtsLine[i]); + wtsLine[i] += 64; // wts ic0..7(oc0..7) + acc1.mac(in_a.extract<32>(0), in_b); + // Shift input A by 1 row (1x8) which is by 1 (the 8 is the ic=8) + in_a = aie::shuffle_down(in_a, 8); + } + wtsLine[i] += ((kernel_height - 1) * kernel_width * + 64); // Move to next ic/8 position + // No need to load next set of weights because next row of weights + // immediately follows + line[i] += (iw * 8); // Increment to next ic/8 position (reset at + // end of outermost loop) + } // for(int i=kernel_height_start; i o1 = acc1.to_vector(scale); + aie::store_v(output, o1); + output += iw * 8; // Shift to next oc/8 + + acc1 = aie::zeros(); + + for (int i = kernel_height_start; i < kernel_height_end; i++) { + line[i] -= (input_channels / 8) * + (iw * 8); // shift back to beginning of this section + } + } // for(int oc=0; oc<(output_channels/8); oc++) { + } + event1(); +} + +#else // UINT8_ACT + +//***************************************************************************** +// conv2d 3x3 - vector +// act: uint8, wts: int8, out: uint8 +//***************************************************************************** +// Takes 3 input lines and computes 1 output line +void conv2dk3_ui8_vector(uint8_t *line0, uint8_t *line1, uint8_t *line2, + int8_t *wts, uint8_t *output, + const int32_t input_width, + const int32_t input_channels, + const int32_t output_channels, + const int32_t kernel_width, + const int32_t kernel_height, const int32_t check, + const int scale, const int channel_offset) { + event0(); + + // Compute + using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>; + ::aie::set_saturation( + aie::saturation_mode::saturate); // Needed to saturate properly to uint8 + ::aie::set_rounding( + 
aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  constexpr unsigned VecFactor = 16;
+
+  // const int scale = 11;
+
+  // basic MMUL intrinsic needed is k x ic x oc
+  // k is number of inputs processed at a time
+  // So if ic=8, oc=4, then k=8 and we use 8x8x4
+  const unsigned k =
+      256 / (input_channels * output_channels); // 8 inputs per vector output
+
+  aie::vector<uint8, 32> zero32 = aie::zeros<uint8, 32>();
+
+  // aie::vector<uint8, 64> prev_a[3],
+  // aie::vector<uint8, 64> in_a;
+  // aie::vector<uint8, 64> tmp_a;
+  // aie::vector<uint8, 32> tmp_a1, tmp_a2;
+  // aie::vector<int8, 64> in_b;
+
+  uint8_t *restrict line[3];
+  // uint8_t *line[3];
+  line[0] = line0;
+  line[1] = line1;
+  line[2] = line2;
+
+  int8_t *restrict wtsLine[3];
+  // int8_t *wtsLine[3];
+  // oc,ic,ky,kx,ic8,oc8
+  wtsLine[0] = wts + (channel_offset / 8) * (input_channels / 8) *
+                         kernel_height * kernel_width * 64;
+  wtsLine[1] = wts +
+               (channel_offset / 8) * (input_channels / 8) * kernel_height *
+                   kernel_width * 64 +
+               kernel_width * 64; // next kernel line is always 8*8 away
+  wtsLine[2] = wts +
+               (channel_offset / 8) * (input_channels / 8) * kernel_height *
+                   kernel_width * 64 +
+               2 * kernel_width * 64; // next kernel line is always 8*8 away
+
+  MMUL4x8x8 acc_tmp[8];
+
+  // Zero accumulators used for storing partial results
+  // for(int x=0; x<iw; x++) {
+  for (int x = 0; x < 8; x++) {
+    acc_tmp[x] = aie::zeros<acc32, 32>();
+  }
+
+  // TODO temporary workaround. When assigned to input_width, the results are
+  // wrong. ???
+  const int iw = 32;
+  // const int32_t iw = input_width;
+
+  // const int iw_32 = ((input_width/4)-2)/8;
+  // const int iw_32 = ((iw/4)-2)/8;
+  // const int iw_32 = ((32/4)-2)/8;
+  const int iw_32 = 0;
+
+  // const int iw_32_rem = ((input_width/4)-2) % 8;
+  // const int iw_32_rem = ((iw/4)-2) % 8;
+  // const int iw_32_rem = ((32/4)-2) % 8;
+  const int iw_32_rem = 6;
+
+  // output += (channel_offset*iw); // channel_offset/8*iw*8
+
+  int kernel_height_start;
+  int kernel_height_end;
+
+  // int kernel_height_start, kernel_height_end;
+#ifdef BORDER_REPLICATE
+  kernel_height_start = 0;
+  kernel_height_end = kernel_height;
+  // constexpr int kernel_height_start = 0;
+  // constexpr int kernel_height_end = kernel_height;
+#else // Zero border for 3x3
+  // constexpr int kernel_height_start = 0;
+  // constexpr int kernel_height_end = kernel_height-1;
+
+  // if(check == top)
+  //   idx_adj = 1;
+
+  // We skip top or bottom row for zero border
+  switch (check) {
+  case top:
+    kernel_height_start = 1;
+    kernel_height_end = kernel_height;
+    break;
+  case middle:
+    kernel_height_start = 0;
+    kernel_height_end = kernel_height;
+    break;
+  case bottom:
+    kernel_height_start = 0;
+    kernel_height_end = kernel_height - 1;
+    break;
+  }
+#endif
+
+  // --------------------------------------------------------------------
+  // Leftmost pattern
+  // --------------------------------------------------------------------
+  // Computes leftmost 4 inputs for all input/output channels.
+  // This shifts the leftmost input data by 1 (x8 channels) for 3x3 to
+  // account for border. Border replicate copies the leftmost input while
+  // 0 border shifts in 0's. If we need to support larger than 3x3, the
+  // replicate logic would need to be changed.
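+  // Example with one pixel = 8 channels: the shift below turns
+  //   in_a = [a0 a1 a2 a3 a4 a5 a6 a7]
+  // into
+  //   in_a = [ z a0 a1 a2 a3 a4 a5 a6]
+  // where z is zeros for the zero border or a copy of a0 under
+  // BORDER_REPLICATE; aie::shuffle_up_fill(in_a, tmp_a, 8) performs this
+  // 8-element (one-pixel) shift.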
+ // -------------------------------------------------------------------- + { + // in_b = aie::load_v<64>(wtsLine[kernel_height_start]); + // wtsLine[kernel_height_start] +=64; // wts ic0..7(oc0..7) + + MMUL4x8x8 acc1 = aie::zeros(); + + for (int oc = 0; oc < (output_channels / 8); oc++) { + for (int ic = 0; ic < (input_channels / 8); ic++) + chess_loop_range(2, ) { + for (int i = kernel_height_start; i < kernel_height_end; i++) + chess_prepare_for_pipelining chess_loop_range(2, ) + // chess_unroll_loop() + { + // Load input data [a0 a1 a2 a3 a4 a5 a6 a7] where each position + // has data for 8 channels + auto tmp_a1 = aie::load_v<32>(line[i]); + line[i] += 32; // act 0..3 (ic0..7 for each) + auto tmp_a2 = + aie::load_v<32>(line[i]); // act 4..7 (ic0..7 for each) + auto in_a = aie::concat(tmp_a1, tmp_a2); + + aie::vector tmp_a; +#ifdef BORDER_REPLICATE + tmp_a1 = aie::shuffle_up(tmp_a1, 24); + tmp_a.insert<32>(1, tmp_a1); +#else + tmp_a = aie::zeros(); +#endif + // Shift right 1 input (8 channels) [- a0 a1 a2 a3 a4 a5 a6] where + // - is either a0 or 0's + in_a = aie::shuffle_up_fill(in_a, tmp_a, 8); + + // Previous buffer stores shifted data, [- - - - a0 a1 a2 a3] + // where - is + // prev_a[i] = aie::shuffle_up(in_a, 24); // Shift right (4-1)*8 + + // prev_a[i] = in_a; + // prev_a[i] = aie::shuffle_up(prev_a[i], 24); // Shift right + // (4-1)*8 + + // For kernel width, we load 64 weights (8 ics x 8 ocs) and + // multiply it with the act buffer. acc[32] += in_a[32] * wts[64] + // We then shift the buffer left by 1 data position (8 channels). + for (int j = 0; j < kernel_width; j++) + chess_loop_range(3, 3) // TODO Assume 3x3 + chess_unroll_loop() { + auto in_b = aie::load_v<64>(wtsLine[i]); + wtsLine[i] += 64; // wts ic0..7(oc0..7) + acc1.mac(in_a.extract<32>(0), in_b); + // Shift input A by 1 row (1x8) which is by 1 (the 8 is the + // ic=8) + in_a = aie::shuffle_down(in_a, 8); + } + wtsLine[i] -= + (kernel_width * 64); // Reset weight pointer for this line + // wtsLine[i] += ((kernel_height-1)*kernel_width*64); // Move to + // next ic/8 position No need to load next set of weights because + // next row of weights immediately follows line[i] += (iw*4)*8; // + // Increment to next ic/8 position (reset at end of outermost + // loop) + } // for(int i=kernel_height_start; i o1 = acc1.to_vector(scale); + aie::store_v(output, o1); + output += iw * 8; // Shift to next oc/8 offset for left side + + acc1 = aie::zeros(); + + // Shift back to beginning of input + for (int i = kernel_height_start; i < kernel_height_end; i++) + chess_loop_range(2, ) { line[i] -= (input_channels / 8) * (iw * 8); } + + } // for(int oc=0; oc<(output_channels/8); oc++) { + + // Reset output to beginning, then add 4*8 + // Reset wts to beginning of wts + // Reset line to beginning of input, then add 4*8 + output -= (output_channels / 8) * (iw * 8) - 32; + for (int i = kernel_height_start; i < kernel_height_end; i++) + chess_loop_range(2, ) { + wtsLine[i] -= (output_channels / 8) * (input_channels / 8) * + kernel_width * kernel_height * + 64; // kernel_width*kernel_height*8*8 + // line[i] -= (output_channels/8)*(input_channels/8)*(iw*8)-32; // + line[i] += 32; + } + } + + // -------------------------------------------------------------------- + // Middle pattern + // -------------------------------------------------------------------- + // The middle seciton algorithm is different because we want to minimize + // the reloading of weights and activations. 
So instead, we use up to 8
+  // accumulators to store partial products with activations being shifted.
+  // Then for the next kernel position, we reload weights.
+  //
+  // H,W,C8
+  // --------------------------------------------------------------------
+
+  // Main loop for when input_width/4-2 > 8
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) {
+        for (int ic = 0; ic < (input_channels / 8); ic++)
+          chess_loop_range(2, ) {
+            for (int i = kernel_height_start; i < kernel_height_end; i++)
+              chess_prepare_for_pipelining chess_loop_range(2, ) { // 1 to 3
+
+                for (int j = 0; j < kernel_width; j++)
+                  chess_loop_range(3, 3) // TODO Assume 3x3
+                      chess_unroll_loop() {
+                    aie::vector<int8, 64> wtsVec = aie::load_v<64>(wtsLine[i]);
+                    wtsLine[i] += 64;
+
+                    // prev = x0..x3(ci0..ci7)
+                    auto prev = aie::load_v<32>((line[i] - 32));
+                    auto curr = aie::load_v<32>((line[i]));
+                    line[i] += 32;
+                    auto next = aie::load_v<32>((line[i]));
+
+                    auto tprev = aie::concat(zero32, prev);
+                    auto tmp1 = aie::concat(curr, next);
+
+                    tmp1 = aie::shuffle_up_fill(
+                        tmp1, tprev, 8); // curr = x3..x6(ci0..ci7)
+
+                    tmp1 = aie::shuffle_down(
+                        tmp1, j * 8); // curr = x4..x7(ci0..ci7) to
+                                      // x5..x8(ci0..ci7)
+
+                    // j = 0, 1, 2
+                    int j1 = j + 1;                // 1, 2, 3
+                    int j2 = j + 3 - (j >> 1) * 4; // 3, 4, 1
+                    int lineIncr = (j >> 1) * 32;  // 0, 0, 32
+
+                    for (int x = 0; x < 8; x++)
+                      chess_unroll_loop() chess_loop_range(8, 8) {
+                        acc_tmp[x].mac(tmp1.extract<32>(0), wtsVec);
+
+                        tmp1 = aie::shuffle_down(tmp1, j1 * 8);
+                        tmp1.insert(1, aie::load_v<32>(line[i] + lineIncr));
+                        line[i] += 32;
+                        tmp1 = aie::shuffle_down(tmp1, j2 * 8);
+                      }           // for(int x=0; x<8; x++)
+                    line[i] -= 320; // (8+2)*32, Reset line buffer ptr to
+                                    // beginning of line (after first 4)
+                  } // for(int j=0; j<kernel_width; j++)
+              }     // for(int i=kernel_height_start; i<kernel_height_end; i++)
+          }         // for(int ic=0; ic<(input_channels/8); ic++)
+
+        // Write scaled outputs and reset accumulators
+        for (int x = 0; x < 8; x++) {
+          aie::vector<uint8, 32> o1 = acc_tmp[x].to_vector<uint8>(scale);
+          aie::store_v(output, o1);
+          output += 32;
+          acc_tmp[x] = aie::zeros<acc32, 32>();
+        }
+        // For next 8 activations, reset line buffer and weights
+        for (int i = kernel_height_start; i < kernel_height_end; i++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            line[i] -=
+                (input_channels / 8) * (iw * 8); // length of act to shift back
+          }
+      } // for(int iw_32c=0; iw_32c<iw_32; iw_32c++)
+    }   // for(int oc=0; oc<(output_channels/8); oc++)
+
+  } // if(iw_32 > 0)
+
+  // Secondary loop for input_width remainder (iw_32_rem < 8)
+  if (iw_32_rem > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int ic = 0; ic < (input_channels / 8); ic++)
+        chess_loop_range(2, ) {
+          for (int i = kernel_height_start; i < kernel_height_end; i++)
+            chess_prepare_for_pipelining chess_loop_range(2, ) { // 1 to 3
+              for (int j = 0; j < kernel_width; j++)
+                chess_loop_range(3, 3) // TODO Assume 3x3
+                    chess_unroll_loop() {
+                  // New weight every kernel_width
+                  aie::vector<int8, 64> wtsVec = aie::load_v<64>(wtsLine[i]);
+                  wtsLine[i] += 64;
+
+                  // prev = x0..x3(ci0..ci7)
+                  auto prev = aie::load_v<32>((line[i] - 32));
+                  auto curr = aie::load_v<32>((line[i]));
+                  line[i] += 32;
+                  auto next = aie::load_v<32>((line[i]));
+
+                  auto tprev = aie::concat(zero32, prev);
+                  auto tmp1 = aie::concat(curr, next);
+
+                  // j = 0, 1, 2
+                  int jr0 = (2 - j) >> 1;          // 1, 0, 0
+                  int j0 = (j >> 1);               // 0, 0, 1
+                  int j1 = j + 1;                  // 1, 2, 3
+                  int j2 = j + 3 - ((j >> 1) * 4); // 3, 4, 1
+                  int lineIncr = (j >> 1) * 32;    // 0, 0, 32
+
+                  tmp1 = aie::shuffle_up_fill(
+                      tmp1, tprev, jr0 * 8); // curr = x3..x6(ci0..ci7)
+
+                  tmp1 = aie::shuffle_down(
+                      tmp1, j0 * 8); // curr = x4..x7(ci0..ci7) to
+                                     // x5..x8(ci0..ci7)
+
+                  for (int x = 0; x < iw_32_rem; x++) // remainder width < 8
+                    chess_unroll_loop() {
+                      acc_tmp[x].mac(tmp1.extract<32>(0), wtsVec);
+
+                      tmp1 = aie::shuffle_down(tmp1, j1 * 8);
+                      tmp1.insert(1, aie::load_v<32>(line[i] + lineIncr));
+                      line[i] += 32;
+                      tmp1 = aie::shuffle_down(tmp1, j2 * 8);
+                    }
+                  line[i] -= (iw_32_rem + 1) *
+                             32; // Reset line buffer ptr to beginning of
+                                 // line (after first 4)
+                } // for(int j=0; j<kernel_width; j++)
+            }     // for(int i=kernel_height_start; i<kernel_height_end; i++)
+        }         // for(int ic=0; ic<(input_channels/8); ic++)
+
+      // Write scaled remainder outputs and reset accumulators
+      for (int x = 0; x < iw_32_rem; x++) {
+        aie::vector<uint8, 32> o1 = acc_tmp[x].to_vector<uint8>(scale);
+        aie::store_v(output, o1);
+        output += 32;
+        acc_tmp[x] = aie::zeros<acc32, 32>(); // Reset accumulators
+      }
+      // Reset line ptr to beginning of input
+      for (int i = kernel_height_start; i < kernel_height_end; i++)
+        chess_prepare_for_pipelining chess_loop_range(2, ) {
+          line[i] -= (input_channels / 8) * (iw * 8);
+        }
+      // Output ptr should be in the right place (next oc/8)
+      output += (iw * 8) - (iw_32_rem * 32); // 32 = 4*8, shift to next oc/8
+    } // for(int oc=0; oc<(output_channels/8); oc++)
+
+    // Reset weights and line buffers for right side
+    for (int i = kernel_height_start; i < kernel_height_end; i++)
+      chess_prepare_for_pipelining chess_loop_range(2, ) {
+        wtsLine[i] -= (output_channels / 8) * (input_channels / 8) *
+                      kernel_width * kernel_height *
+                      64; // kernel_width*kernel_height*8*8
+        line[i] +=
+            iw_32_rem * 32; // shift to beginning of right data, iw_32_rem*4*8
+      }
+    // shift back so we're aligned with beginning of first oc/8 (rightmost 4
+    // data)
+    output -= (output_channels / 8) * (iw * 8) - (iw_32_rem * 32);
+
+  } // if (iw_32_rem > 0)
+
+  // --------------------------------------------------------------------
+  // Right patterns
+  // --------------------------------------------------------------------
+  {
+    MMUL4x8x8 acc1 = aie::zeros<acc32, 32>();
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int ic = 0; ic < (input_channels / 8); ic++)
+        chess_loop_range(2, ) {
+          for (int i = kernel_height_start; i < kernel_height_end; i++)
+            chess_prepare_for_pipelining chess_loop_range(2, ) {
+              // Load next set of data for input A (matrix row), need stride
+              // info or line1/2/3 pointer
+              // TODO, did not store previous so need to load it again
+              auto tmp_a1 =
+                  aie::load_v<32>(line[i] - 32); // act 24..27 (ic0..7 for each)
+              auto tmp_a2 =
+                  aie::load_v<32>(line[i]); // act 28..31 (ic0..7 for each)
+              auto in_a = aie::concat(tmp_a1, tmp_a2);
+
+              aie::vector<uint8, 32> tmp_a;
+#ifdef BORDER_REPLICATE
+              tmp_a2 = aie::shuffle_down(tmp_a2, 24);
+              tmp_a.insert<32>(0, tmp_a2);
+#else
+              tmp_a = aie::zeros<uint8, 32>();
+#endif
+              // shift by 32-8 (fill 32 then shift up by 8)
+              in_a =
+                  aie::shuffle_down_fill(in_a, tmp_a, 24); // act 27..31 - - -
+
+              for (int j = 0; j < kernel_width; j++)
+                chess_loop_range(3, 3) chess_unroll_loop() {
+                  auto in_b = aie::load_v<64>(wtsLine[i]);
+                  wtsLine[i] += 64; // wts ic0..7(oc0..7)
+                  acc1.mac(in_a.extract<32>(0), in_b);
+                  // Shift input A by 1 row (1x8) which is by 1 (the 8 is the
+                  // ic=8)
+                  in_a = aie::shuffle_down(in_a, 8);
+                }
+              wtsLine[i] += ((kernel_height - 1) * kernel_width *
+                             64); // Move to next ic/8 position
+              // No need to load next set of weights because next row of
+              // weights immediately follows
+              line[i] += (iw * 8); // Increment to next ic/8 position (reset at
+                                   // end of outermost loop)
+            } // for(int i=kernel_height_start; i<kernel_height_end; i++)
+        }     // for(int ic=0; ic<(input_channels/8); ic++)
+
+      aie::vector<uint8, 32> o1 = acc1.to_vector<uint8>(scale);
+      aie::store_v(output, o1);
+      output += iw * 8; // Shift to next oc/8
+
+      acc1 = aie::zeros<acc32, 32>();
+
+      for (int i = kernel_height_start; i < kernel_height_end; i++)
+        chess_prepare_for_pipelining chess_loop_range(2, ) {
+          line[i] -= (input_channels / 8) *
+                     (iw * 8); // shift back to beginning of this section
+        }
+    } // for(int oc=0; oc<(output_channels/8); oc++)
+  }
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+extern "C" {
+
+#ifdef SCALAR
+
+#ifdef INT8_ACT
+
+void conv2dk3_i8(int8_t *line0, int8_t *line1, int8_t *line2, int8_t *wts,
+                 uint8_t *output, const int32_t input_width,
+                 const int32_t input_channels, const int32_t output_channels,
+                 const int32_t kernel_width, const int32_t kernel_height,
+                 const int32_t check, const int scale,
+                 const int channel_offset) {
+  conv2dk3_i8_scalar(line0, line1, line2, wts, output, input_width,
+                     input_channels, output_channels, kernel_width,
+                     kernel_height, check, scale, channel_offset);
+}
+
+#else // UINT8_ACT
+
+void conv2dk3_ui8(uint8_t *line0, uint8_t *line1, uint8_t *line2, int8_t *wts,
+                  uint8_t *output, const int32_t input_width,
+                  const int32_t input_channels, const int32_t output_channels,
+                  const int32_t kernel_width, const int32_t kernel_height,
+                  const int32_t check, const int scale,
+                  const int channel_offset) {
+  conv2dk3_ui8_scalar(line0, line1, line2, wts, output, input_width,
+                      input_channels, output_channels, kernel_width,
+                      kernel_height, check, scale, channel_offset);
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+void conv2dk3_i8(int8_t *line0, int8_t *line1, int8_t *line2, int8_t *wts,
+                 uint8_t *output, const int32_t input_width,
+                 const int32_t input_channels, const int32_t output_channels,
+                 const int32_t kernel_width, const int32_t kernel_height,
+                 const int32_t check, const int scale,
+                 const int channel_offset) {
+  conv2dk3_i8_vector(line0, line1, line2, wts, output, input_width,
+                     input_channels, output_channels, kernel_width,
+                     kernel_height, check, scale, channel_offset);
+}
+
+#else // UINT8_ACT
+
+void conv2dk3_ui8(uint8_t *line0, uint8_t *line1, uint8_t *line2, int8_t *wts,
+                  uint8_t *output, const int32_t input_width,
+                  const int32_t input_channels, const int32_t output_channels,
+                  const int32_t kernel_width, const int32_t kernel_height,
+                  const int32_t check, const int scale,
+                  const int channel_offset) {
+  conv2dk3_ui8_vector(line0, line1, line2, wts, output, input_width,
+                      input_channels, output_channels, kernel_width,
+                      kernel_height, check, scale, channel_offset);
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+}
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk3.h
b/aie_kernels/aie2/conv2dk3.h new file mode 100755 index 0000000000..61a2f8e698 --- /dev/null +++ b/aie_kernels/aie2/conv2dk3.h @@ -0,0 +1,33 @@ +//===- conv2dk3.h -------------------------------------------------*- C++ +//-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#ifndef _CONV2DK3_H +#define _CONV2DK3_H + +extern "C" { + +void conv2dk3_i8(int8_t *line0, int8_t *line1, int8_t *line2, int8_t *wts, + uint8_t *output, const int32_t input_width, + const int32_t input_channels, const int32_t output_channels, + const int32_t kernel_width, const int32_t kernel_height, + const int32_t check, const int scale, + const int channel_offset); + +void conv2dk3_ui8(uint8_t *line0, uint8_t *line1, uint8_t *line2, int8_t *wts, + uint8_t *output, const int32_t input_width, + const int32_t input_channels, const int32_t output_channels, + const int32_t kernel_width, const int32_t kernel_height, + const int32_t check, const int scale, + const int channel_offset); + +} // extern "C" + +#endif diff --git a/programming_examples/ml/bottleneck/CMakeLists.txt b/programming_examples/ml/bottleneck/CMakeLists.txt new file mode 100644 index 0000000000..4b897cb29c --- /dev/null +++ b/programming_examples/ml/bottleneck/CMakeLists.txt @@ -0,0 +1,89 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. 
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DOpenCV_DIR: Path to OpenCV install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif ()
+
+set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
+set(EDGEDETECT_HEIGHT 1080 CACHE STRING "image height")
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+find_package(OpenCV REQUIRED)
+message("opencv library path: ${OpenCV_LIB_PATH}")
+message("opencv libs: ${OpenCV_LIBS}")
+
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC
+    EDGEDETECT_WIDTH=${EDGEDETECT_WIDTH}
+    EDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT}
+    DISABLE_ABI_CHECK=1
+)
+
+target_include_directories (${currentTarget} PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils
+    ${XRT_INC_DIR}
+    ${OpenCV_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${OpenCV_LIB_PATH}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        ${OpenCV_LIBS}
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        ${OpenCV_LIBS}
+    )
+endif()
diff --git a/programming_examples/ml/bottleneck/Makefile b/programming_examples/ml/bottleneck/Makefile
new file mode 100755
index 0000000000..f5c6e4561f
--- /dev/null
+++ b/programming_examples/ml/bottleneck/Makefile
@@ -0,0 +1,40 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+include ../../makefile-common
+
+mlirFileName = aie
+
+all: build/conv2dk1.o build/conv2dk3.o build/conv2dk1_skip.o build/final.xclbin
+
+build/${mlirFileName}.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+insts.txt: build/${mlirFileName}.mlir
+	aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $<
+
+build/conv2dk1.o: ../../../aie_kernels/aie2/conv2dk1.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
+
+build/conv2dk3.o: ../../../aie_kernels/aie2/conv2dk3.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@
+
+build/conv2dk1_skip.o: ../../../aie_kernels/aie2/conv2dk1_skip.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
+
+build/final.xclbin: build/${mlirFileName}.mlir
+	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+
+clean:
+	rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log .xclbin sim \
+		chess* *.o insts.txt \
+		*.log aie_partition.json *.bin BOOT.BIN _x test.exe
+
+run_py:
+	${powershell} python3 test.py
diff --git a/programming_examples/ml/bottleneck/README.md b/programming_examples/ml/bottleneck/README.md
new file mode 100644
index 0000000000..144b8e36f2
--- /dev/null
+++ b/programming_examples/ml/bottleneck/README.md
@@ -0,0 +1,125 @@
+
+
+# The Bottleneck Block
+## Introduction
+The bottleneck block is a key component in deep neural network architectures, such as ResNet. It is designed to help address the challenge of training very deep networks by reducing the computational cost while maintaining or improving performance. This README provides an overview of the process and considerations for accelerating a single bottleneck block.
+
+
+## Bottleneck Block Overview
+The components and functionality of a standard bottleneck block are as follows:
+
+* Identity Mapping: The core idea behind bottleneck blocks is the concept of identity mapping. Traditional neural network layers aim to learn a mapping from input to output. In contrast, a bottleneck block learns a residual mapping, which is the difference between the input and the output. The original input is then added back to this residual mapping to obtain the final output. Mathematically, this can be represented as `output = input + residual`.
+
+* Convolutional Layers: Bottleneck blocks typically consist of one or more convolutional layers. These layers are responsible for learning features from the input data. Convolutional layers apply filters/kernels to the input feature maps to extract relevant patterns and features. The number of filters, kernel size, and other parameters can vary based on the specific architecture and requirements.
+
+* Activation Function: After each convolutional layer, an activation function is applied to introduce non-linearity into the network. Rectified Linear Unit (ReLU) is commonly used as the activation function due to its simplicity and effectiveness.
+
+* Batch Normalization: Batch normalization is often employed after convolutional layers to stabilize and accelerate the training process. It normalizes the activations of each layer, making optimization more robust and efficient.
+
+* Skip Connection (Identity Shortcut): This is the hallmark of bottleneck blocks. The skip connection directly passes the input from one layer to a later layer without any modification. It provides an alternative, shorter path for gradient flow during training. If the input and output dimensions of the bottleneck block are the same, the skip connection directly adds the input to the output.
If the dimensions differ, the skip connection might include a 1x1 convolutional layer to adjust the dimensions accordingly. + +* Final Output: The final output of the bottleneck block is obtained by adding the input to the output of the convolutional layers (including any adjustments made to match dimensions via the skip connection). +

+![block](bottleneck_block.png)
+
+The bottleneck block has a 1x1 convolution layer for dimension reduction, a 3x3 convolution layer, and a 1x1 convolution layer for dimension restoration.
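+
+As a point of reference, below is a minimal PyTorch sketch of the computation this design implements (the 256 -> 64 -> 256 channel configuration matches this example; BatchNorm is omitted because it is assumed fused into the convolution weights, as described later in this README):
+
+```
+import torch.nn as nn
+
+class Bottleneck(nn.Module):
+    # Channel sizes follow this example: 256 -> 64 -> 64 -> 256
+    def __init__(self, in_planes=256, planes=64):
+        super().__init__()
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)  # 1x1 reduce
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)  # 3x3
+        self.conv3 = nn.Conv2d(planes, in_planes, kernel_size=1, bias=False)  # 1x1 restore
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        out = self.relu(self.conv1(x))
+        out = self.relu(self.conv2(out))
+        out = self.conv3(out)
+        return self.relu(out + x)  # skip connection: output = input + residual
+```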

+
+## Acceleration Techniques
+1. Depth-First Implementation: Spatial architectures provide coarse-grained flexibility that allows for tailoring of the dataflow to optimize data movement. By tailoring the dataflow, we implement a depth-first schedule for the bottleneck block, routing the output of one convolutional operation on an AIE core directly to another convolutional operation on a separate AIE core, all without the need to transfer intermediate results off-chip. This approach effectively minimizes the memory footprint associated with intermediate data, mitigating the overhead of costly off-chip accesses and increasing overall performance.
+
+2. Data Layout: We optimize the activation and weight layouts to enhance memory access patterns and enable effective utilization of the AIE parallel processing units, ultimately improving the performance of 2D convolution operations.
+
+3. Kernel Optimization: To optimize convolution operations on AIE, we vectorize the code using AIE vector intrinsics. We load 8 elements of the input channel into vector registers using a vector load intrinsic and apply the convolution operation on this loaded data, utilizing the vector unit for enhanced computational efficiency. To ensure accurate convolution results, particularly at the edges of feature maps, we implement zero-padding to handle boundary conditions. The input is a 4x8 matrix corresponding to 4 row elements and 8 input channels.
+
+4. Quantization: We use int8 precision for activations and weights. At int8 precision, AIE offers the highest compute density with 256 MAC/cycle.
+
+5. Layer Fusion: We perform two levels of fusion. First, we fuse ReLU into the convolution using the SRS capabilities of AIE. Second, we fuse BatchNorm into the convolution weights.
+
+
+## Data Layout
+We need to ensure that the data layout is compatible with efficient SIMD processing and rearrange the input data into a format where contiguous elements represent consecutive X-dimension values for each channel. For more efficient processing, we adopt a channels-last memory ordering, denoted as NYCXC8, to ensure that channels become the densest dimension. Operating on 8 elements simultaneously, we process 8 channels with the same width at once. Subsequently, we traverse the entire width dimension, handling the remaining channels in batches of 8. This process continues row-wise, resulting in our final data layout pattern: NYCXC8. This transformation ensures that data can be efficiently loaded into SIMD registers and processed in parallel.
+
+YCXC8 Input/Output Data Layout:
+
+In the YCXC8 (with N=1) data layout, the data is organized in memory as follows:
+
+* Y: Represents the feature map height (rows).
+* C: Denotes the channel groups.
+* X: Represents the feature map width (columns).
+* C8: Indicates that 8 elements of the input channel are processed together.
+
+OIYXI8O8 Weight Layout:
+
+We align the weight layout as O,I,Y,X,I8,O8 to match the input image processing. We first load the weight tensor, organizing it to match this layout, where the dimensions represent: output channels, input channels, kernel height, kernel width, input channel groups of 8, and output channel groups of 8. By aligning the weight layout in this manner, we enable seamless integration with the input data layout, maximizing parallelism and minimizing memory access overhead.
+
+In the OIYXI8O8 data layout, the data is organized in memory as follows:
+
+* O: Denotes the number of output channels.
+* I: Denotes the number of input channels.
+* Y: Represents the kernel height.
+* X: Represents the kernel width.
+* I8: Indicates that 8 elements of the input channel are processed together.
+* O8: Indicates that 8 elements of the output channel are processed together.
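+
+For illustration, the activation reordering from a channels-first CYX layout to YCXC8 can be expressed in plain numpy as below. This is only a sketch of the transformation; in this example the reordering is actually performed by the `DataShaper` utility in `test.py`, e.g. `ds.reorder_mat(ifm, "YCXC8", "CYX")`:
+
+```
+import numpy as np
+
+def cyx_to_ycxc8(act):
+    # act: (C, Y, X) -> (Y, C//8, X, 8); channel groups of 8 become the densest dimension
+    C, Y, X = act.shape
+    assert C % 8 == 0
+    return act.reshape(C // 8, 8, Y, X).transpose(2, 0, 3, 1).copy()
+```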
+
+## Fusing Convolution and Batch Normalization
+
+We assume the BatchNorm layer has been fused into the convolution layer. Fusing BatchNorm into convolution incorporates the normalization step directly into the convolution operation: the weights of the convolutional filters are modified to include the scaling and shifting factors, so that a single convolution performs the normalization, scaling, and shifting in one step.
+
+## Fusing ReLU
+
+Fusing ReLU into the convolution operation can further optimize the implementation by reducing memory bandwidth requirements and computational overhead. The ReLU activation function introduces non-linearity by setting negative values to zero and leaving positive values unchanged. We use SIMD instructions to compute the ReLU activation in parallel with the convolution: after performing the convolution operation, the activation is applied at the vector register level.
+We use `aie::set_rounding()` and `aie::set_saturation()` to set the rounding and saturation modes for the results computed in the accumulator. Setting the rounding mode to `positive_inf` rounds halfway values towards positive infinity, while setting saturation to `aie::saturation_mode::saturate` clamps results to the uint8 range (0, 255).
+
+```
+::aie::set_saturation(
+    aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+::aie::set_rounding(
+    aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+```
+
+After convolution and ReLU fusion, the output data is generated in the YCXC8 layout. Ensure that the output data layout is compatible with subsequent layers or processing steps in the neural network architecture.
+
+### Benefits of ReLU Fusion:
+
+1. Reduced Memory Bandwidth:
+By fusing ReLU into the convolution operation, unnecessary memory accesses and data transfers associated with a separate ReLU computation are eliminated, reducing memory bandwidth requirements.
+
+2. Improved Performance:
+Fusing ReLU reduces the number of instructions executed per element, resulting in improved computational efficiency and overall performance of the convolution operation.
+
+3. Simplified Code Structure:
+Fusing ReLU into the convolution kernel simplifies the code structure and reduces the overhead associated with separate activation function calls, leading to cleaner and more maintainable code.
+
+4. Enhanced Resource Utilization:
+By combining the convolution and ReLU operations, computational resources such as CPU cores or SIMD units are utilized more efficiently, maximizing throughput and achieving better resource utilization.
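+
+To make the BatchNorm fusion described above concrete, the sketch below shows the standard offline weight transformation. The helper is illustrative and not part of this example's code; since the convolutions in this example are bias-free, the folded bias term would additionally have to be absorbed by the kernel or a subsequent step:
+
+```
+import torch
+
+def fuse_conv_bn(conv_w, gamma, beta, mean, var, eps=1e-5):
+    # Per-output-channel scale derived from the BatchNorm statistics
+    scale = gamma / torch.sqrt(var + eps)          # shape: (out_channels,)
+    fused_w = conv_w * scale.reshape(-1, 1, 1, 1)  # scale each output filter
+    fused_b = beta - mean * scale                  # absorbed shift
+    return fused_w, fused_b
+```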
+ +## Compilation +To compile the design: +``` +make +``` + +To run the design: +``` +make run_py +``` + +### Prerequisites +To install the dependencies, run the following command: +``` +pip install -r requirements.txt + +``` \ No newline at end of file diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py new file mode 100644 index 0000000000..a488ae8ded --- /dev/null +++ b/programming_examples/ml/bottleneck/aie2.py @@ -0,0 +1,639 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.dialects.ext import memref, arith +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx +from aie.ir import MemRefType, TypeAttr + +import sys + +# tracing definitions +trace_sz_in_bytes = 8192 +trace_sz_in_i32s = trace_sz_in_bytes // 4 +enableTrace = False + +# Define bottleneck layer sizes + +tensorInW = 32 +tensorInH = 32 +tensorInC = 256 + +tensorL1InC = tensorInC +tensorL1OutC = tensorL1InC // 4 + +tensorL2InC = tensorL1OutC +tensorL2OutC = tensorL2InC + +tensorL3InC = tensorL2OutC +tensorL3OutC = tensorL3InC * 4 + + +def bottleneck4AIEs(): + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.ipu) + def deviceBody(): + + # define types + uint8_ty = IntegerType.get_unsigned(8) + int8_ty = IntegerType.get_signless(8) + int16_ty = IntegerType.get_signless(16) + int32_ty = IntegerType.get_signless(32) + + tensorLayer1In_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL1InC, + ), + int8_ty, + ) + weightsLayer1_ty = MemRefType.get((tensorL1InC * tensorL1OutC,), int8_ty) + tensorLayer1Out_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL1OutC, + ), + uint8_ty, + ) + + tensorLayer2In_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL2InC, + ), + uint8_ty, + ) + weightsLayer2_ty = MemRefType.get( + (3 * 3 * tensorL2InC * tensorL2OutC,), int8_ty + ) + tensorLayer2Out_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL2OutC // 2, + ), + uint8_ty, + ) + + tensorLayer3In_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL3InC // 2, + ), + uint8_ty, + ) + weightsLayer3_ty = MemRefType.get((tensorL3InC * tensorL3OutC,), int8_ty) + tensorLayer3Out_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL3OutC, + ), + uint8_ty, + ) + + allWeights_ty = MemRefType.get( + ( + tensorL1InC * tensorL1OutC + + 3 * 3 * tensorL2InC * tensorL2OutC + + tensorL3InC * tensorL3OutC, + ), + int8_ty, + ) + + # kernel definitions + conv2dk1 = external_func( + "conv2dk1_i8", + inputs=[ + tensorLayer1In_ty, + weightsLayer1_ty, + tensorLayer1Out_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + conv2dk3 = external_func( + "conv2dk3_ui8", + inputs=[ + tensorLayer2In_ty, + tensorLayer2In_ty, + tensorLayer2In_ty, + weightsLayer2_ty, + tensorLayer2Out_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + conv2dk1_skip = external_func( + "conv2dk1_skip_i8", + inputs=[ + tensorLayer3In_ty, + tensorLayer3In_ty, + weightsLayer3_ty, + tensorLayer3Out_ty, + tensorLayer1In_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + if enableTrace: + 
flow(ComputeTile4, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) + + # runtime parameters + + rtpComputeTile2 = Buffer(ComputeTile2, [16], T.i32(), "rtpComputeTile2") + rtpComputeTile3 = Buffer(ComputeTile3, [16], T.i32(), "rtpComputeTile3") + rtpComputeTile4 = Buffer(ComputeTile4, [16], T.i32(), "rtpComputeTile4") + rtpComputeTile5 = Buffer(ComputeTile5, [16], T.i32(), "rtpComputeTile5") + + # set up data movement with OFs + # input tensor (with broadcast for skip connection) + of_inOF_act_L3L2 = object_fifo( + "inOF_act_L3L2", + ShimTile, + [ComputeTile2, MemTile], + [2, 2, 4], + tensorLayer1In_ty, + ) + of_skip_buf = object_fifo( + "skip_buf", MemTile, ComputeTile4, 2, tensorLayer1In_ty + ) + object_fifo_link(of_inOF_act_L3L2, of_skip_buf) + + # weights + inOF_wts_0_L3L2 = object_fifo( + "inOF_wts_0_L3L2", ShimTile, MemTile, 1, allWeights_ty + ) + of_wts_buf_00 = object_fifo( + "wts_buf_00", MemTile, ComputeTile2, 1, weightsLayer1_ty + ) + wts_buf_01 = object_fifo( + "wts_buf_01", + MemTile, + [ComputeTile3, ComputeTile5], + 1, + weightsLayer2_ty, + ) + wts_buf_02 = object_fifo( + "wts_buf_02", MemTile, ComputeTile4, 1, weightsLayer3_ty + ) + object_fifo_link(inOF_wts_0_L3L2, [of_wts_buf_00, wts_buf_01, wts_buf_02]) + + # activation tensor + of_act_2_3_5 = object_fifo( + "act_2_3_5", + ComputeTile2, + [ComputeTile3, ComputeTile5], + [2, 4, 4], + tensorLayer1Out_ty, + ) # 1x1 -> 3x3 + act_3_4 = object_fifo( + "act_3_4", ComputeTile3, ComputeTile4, 2, tensorLayer2Out_ty + ) # 3x3 -> 1x1 + act_5_4 = object_fifo( + "act_5_4", ComputeTile5, ComputeTile4, 2, tensorLayer2Out_ty + ) # 3x3 -> 1x1 + + # output tensor + outOFL2L3 = object_fifo( + "outOFL2L3", ComputeTile4, ShimTile, 2, tensorLayer3Out_ty + ) + + # 1x1 conv2d + @core(ComputeTile2, "conv2dk1.o") + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights once + element0Weights = of_wts_buf_00.acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtpComputeTile2, [0]) + for _ in for_(tensorInH): + element0ActivactionsIn = of_inOF_act_L3L2.acquire( + ObjectFifoPort.Consume, 1 + ) + element0ActivactionsOut = of_act_2_3_5.acquire( + ObjectFifoPort.Produce, 1 + ) + res = call( + conv2dk1, + [ + element0ActivactionsIn, + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL1InC, + tensorL1OutC, + scale, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "inOF_act_L3L2", 1) + + objectfifo_release(ObjectFifoPort.Produce, "act_2_3_5", 1) + yield_([]) + objectfifo_release(ObjectFifoPort.Consume, "wts_buf_00", 1) + yield_([]) + + # 3x3 conv2d OFM 0-31 + @core(ComputeTile3, "conv2dk3.o") + def core_body(): + scale = 11 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile3, 0) + + # pre-amble: top row + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act_3_4.acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 0, + scale, + 0, + ], + ) + objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) + + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 3 + ) + element0ActivactionsOut = act_3_4.acquire( + ObjectFifoPort.Produce, 1 + ) + res = call( + conv2dk3, + [ + 
elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 1, + scale, + 0, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) + objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) + yield_([]) + + # last part + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act_3_4.acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 2, + scale, + 0, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 2) + objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) + + objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) + yield_([]) + + # 3x3 conv2d OFM 32-63 + @core(ComputeTile5, "conv2dk3.o") + def core_body(): + scale = 11 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile5, 0) + + # pre-amble: top row + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act_5_4.acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 0, + scale, + tensorL2OutC // 2, + ], + ) + + objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) + + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 3 + ) + element0ActivactionsOut = act_5_4.acquire( + ObjectFifoPort.Produce, 1 + ) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 1, + scale, + tensorL2OutC // 2, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) + objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) + yield_([]) + + # last part + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act_5_4.acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 2, + scale, + tensorL2OutC // 2, + ], + ) + objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 2) + objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) + objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) + yield_([]) + + # # 1x1 conv2d and add skip + @core(ComputeTile4, "conv2dk1_skip.o") + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_buf_02.acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtpComputeTile4, [0]) + skipScale = memref.load(rtpComputeTile4, [1]) + + for _ in for_(tensorInH): + element0ActivactionsIn = act_3_4.acquire( + ObjectFifoPort.Consume, 1 + ) + element1ActivactionsIn = act_5_4.acquire( + ObjectFifoPort.Consume, 1 + ) + elementSkipsIn = of_skip_buf.acquire(ObjectFifoPort.Consume, 1) + elementActivactionsOut = outOFL2L3.acquire( + 
ObjectFifoPort.Produce, 1
+                        )
+
+                        call(
+                            conv2dk1_skip,
+                            [
+                                element0ActivactionsIn,
+                                element1ActivactionsIn,
+                                element0Weights,
+                                elementActivactionsOut,
+                                elementSkipsIn,
+                                tensorInW,
+                                tensorL3InC,
+                                tensorL3OutC,
+                                scale,
+                                skipScale,
+                            ],
+                        )
+                        objectfifo_release(ObjectFifoPort.Produce, "outOFL2L3", 1)
+                        objectfifo_release(ObjectFifoPort.Consume, "act_3_4", 1)
+                        objectfifo_release(ObjectFifoPort.Consume, "act_5_4", 1)
+                        objectfifo_release(ObjectFifoPort.Consume, "skip_buf", 1)
+                        yield_([])
+                    objectfifo_release(ObjectFifoPort.Consume, "wts_buf_02", 1)
+                    yield_([])
+
+            # instruction stream generation
+            activationsInSize32b = (tensorInW * tensorInH * tensorInC) // 4
+            acitivationsOutSize32b = activationsInSize32b
+            totalWeightsSize32b = (
+                tensorL1InC * tensorL1OutC
+                + 3 * 3 * tensorL2InC * tensorL2OutC
+                + tensorL3InC * tensorL3OutC
+            ) // 4
+
+            activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty)
+            weightsInL3_ty = MemRefType.get((totalWeightsSize32b,), int32_ty)
+
+            @FuncOp.from_py_func(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty)
+            def sequence(inputFromL3, weightsFromL3, outputToL3):
+
+                if enableTrace:
+                    # Trace output
+
+                    # Trace_Event0, Trace_Event1: Select which events to trace.
+                    # Note that the event buffers only appear to be transferred to DDR in
+                    # bursts of 256 bytes. If less than 256 bytes are written, you may not
+                    # see trace output, or only see it on the next iteration of your
+                    # kernel invocation, as the buffer gets filled up. Note that, even
+                    # though events are encoded as 4 byte words, it may take more than 64
+                    # events to fill the buffer to 256 bytes and cause a flush, since
+                    # multiple repeating events can be 'compressed' by the trace mechanism.
+                    # In order to always generate sufficient events, we add the "assert
+                    # TRUE" event to one slot, which fires every cycle, and thus fills our
+                    # buffer quickly.
+
+                    # Some events:
+                    # TRUE (0x01)
+                    # STREAM_STALL (0x18)
+                    # LOCK_STALL (0x1A)
+                    # EVENTS_CORE_INSTR_EVENT_1 (0x22)
+                    # EVENTS_CORE_INSTR_EVENT_0 (0x21)
+                    # INSTR_VECTOR (0x25) Core executes a vector MAC, ADD or compare instruction
+                    # INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock acquire instruction
+                    # INSTR_LOCK_RELEASE_REQ (0x2D) Core executes a lock release instruction
+                    # EVENTS_CORE_PORT_RUNNING_1 (0x4F)
+                    # EVENTS_CORE_PORT_RUNNING_0 (0x4B)
+
+                    # Trace_Event0 (4 slots)
+                    ipu_write32(0, 4, 0x340E0, 0x4B222125)
+                    # Trace_Event1 (4 slots)
+                    ipu_write32(0, 4, 0x340E4, 0x2D2C1A4F)
+
+                    # Event slots as configured above:
+                    # 0: Kernel executes vector instruction
+                    # 1: Event 0 -- Kernel starts
+                    # 2: Event 1 -- Kernel done
+                    # 3: Port_Running_0
+                    # 4: Port_Running_1
+                    # 5: Lock Stall
+                    # 6: Lock Acquire Instr
+                    # 7: Lock Release Instr
+
+                    # Stream_Switch_Event_Port_Selection_0
+                    # This is necessary to capture the Port_Running_0 and Port_Running_1 events
+                    ipu_write32(0, 4, 0x3FF00, 0x121)
+
+                    # Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
+                    ipu_write32(0, 4, 0x340D0, 0x10000)
+
+                    # Start trace copy out.
+ ipu_writebd_shimtile( + bd_id=3, + buffer_length=trace_sz_in_i32s, + buffer_offset=acitivationsOutSize32b, + enable_packet=0, + out_of_order_id=0, + packet_id=0, + packet_type=0, + column=0, + column_num=1, + d0_stepsize=0, + d0_wrap=0, + d1_stepsize=0, + d1_wrap=0, + d2_stepsize=0, + ddr_id=2, + iteration_current=0, + iteration_stepsize=0, + iteration_wrap=0, + lock_acq_enable=0, + lock_acq_id=0, + lock_acq_val=0, + lock_rel_id=0, + lock_rel_val=0, + next_bd=0, + use_next_bd=0, + valid_bd=1, + ) + ipu_write32(0, 2, 0x1D20C, 0x3) + + # write RTP parameters + IpuWriteRTPOp( + "rtpComputeTile2", col=0, row=2, index=0, value=1 + ) # scale + IpuWriteRTPOp( + "rtpComputeTile3", col=0, row=3, index=0, value=1 + ) # scale + IpuWriteRTPOp( + "rtpComputeTile5", col=0, row=5, index=0, value=1 + ) # scale + IpuWriteRTPOp( + "rtpComputeTile4", col=0, row=4, index=0, value=1 + ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input + IpuWriteRTPOp( + "rtpComputeTile4", col=0, row=4, index=1, value=0 + ) # skip_scale + + ipu_dma_memcpy_nd( + metadata="inOF_act_L3L2", + bd_id=0, + mem=inputFromL3, + sizes=[1, 1, 1, activationsInSize32b], + ) + ipu_dma_memcpy_nd( + metadata="outOFL2L3", + bd_id=2, + mem=outputToL3, + sizes=[1, 1, 1, acitivationsOutSize32b], + ) + ipu_dma_memcpy_nd( + metadata="inOF_wts_0_L3L2", + bd_id=1, + mem=weightsFromL3, + sizes=[1, 1, 1, totalWeightsSize32b], + ) + + ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +bottleneck4AIEs() diff --git a/programming_examples/ml/bottleneck/bottleneck_block.png b/programming_examples/ml/bottleneck/bottleneck_block.png new file mode 100644 index 0000000000..d5e88bbbd1 Binary files /dev/null and b/programming_examples/ml/bottleneck/bottleneck_block.png differ diff --git a/programming_examples/ml/bottleneck/requirements.txt b/programming_examples/ml/bottleneck/requirements.txt new file mode 100644 index 0000000000..08ed5eeb4b --- /dev/null +++ b/programming_examples/ml/bottleneck/requirements.txt @@ -0,0 +1 @@ +torch \ No newline at end of file diff --git a/programming_examples/ml/bottleneck/run.lit b/programming_examples/ml/bottleneck/run.lit new file mode 100644 index 0000000000..ec30002c97 --- /dev/null +++ b/programming_examples/ml/bottleneck/run.lit @@ -0,0 +1,12 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess, torch +// +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1.o +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DUINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk3.cc -o conv2dk3.o +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o +// RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// CHECK: PASS! 
\ No newline at end of file diff --git a/programming_examples/ml/bottleneck/test.py b/programming_examples/ml/bottleneck/test.py new file mode 100644 index 0000000000..34f6347175 --- /dev/null +++ b/programming_examples/ml/bottleneck/test.py @@ -0,0 +1,190 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. + +import torch +import torch.nn as nn +import sys +import math +from aie.utils.ml import DataShaper +import time +import os +import numpy as np +from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute + +torch.use_deterministic_algorithms(True) +torch.manual_seed(0) + +design = "bottleneck_int8" +xclbin_path = os.path.abspath("build/final.xclbin") +insts_path = os.path.abspath("build/insts.txt") + +log_folder = "log/" +if not os.path.exists(log_folder): + os.makedirs(log_folder) + +num_iter = 1 +npu_time_total = 0 +npu_time_min = 9999999 +npu_time_max = 0 +trace_size = 16384 +enable_trace = False +trace_file = "log/trace_" + design + ".txt" +# ------------------------------------------------------ +# Configure this to match your design's buffer size +# ------------------------------------------------------ +dtype_in = np.dtype("int8") +dtype_wts = np.dtype("int8") +dtype_out = np.dtype("uint8") + +shape_in_act = (32, 32, 32, 8) +shape_in_wts1 = (8, 32, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 +shape_in_wts2 = (8, 8, 3, 3, 8, 8) # out,in,ky,kx,in8,out8 +shape_in_wts3 = (32, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 +shape_total_wts = (69632, 1) +shape_out = (32, 32, 32, 8) + +# ------------------------------------------------------ +# Initialize activation, weights, scaling factor for int8 model +# ------------------------------------------------------ +int_inp = torch.randint(1, 100, (1, 256, 32, 32)).type(torch.FloatTensor) +int_weight1 = torch.randint(50, 100, (64, 256, 1, 1)).type(torch.FloatTensor) +int_weight2 = torch.randint(50, 100, (64, 64, 3, 3)).type(torch.FloatTensor) +int_weight3 = torch.randint(50, 100, (256, 64, 1, 1)).type(torch.FloatTensor) + +inp_scale1 = 0.5 +inp_scale2 = 0.5 +inp_scale3 = 0.5 +inp_scale4 = 0.5 + +weight_scale1 = 0.5 +weight_scale2 = 0.5 +weight_scale3 = 0.5 + +combined_scale1 = -math.log2(inp_scale1 * weight_scale1 / inp_scale2) +combined_scale2 = -math.log2(inp_scale2 * weight_scale2 / inp_scale3) +combined_scale3 = -math.log2(inp_scale3 * weight_scale3 / inp_scale1) +combined_scale4 = -math.log2(inp_scale1 / inp_scale4) +conv_scale = 0.0039 # scale to convert int8 output to floating point +relu_scale = 0.0078 # scale to convert int8 output to floating point +min = 0 +max = 255 + +# ------------------------------------------------------ +# Get device, load the xclbin & kernel and register them +# ------------------------------------------------------ +app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, +) + + +# ------------------------------------------------------ +# Define your golden reference +# ------------------------------------------------------ +class bottleneck_int8(nn.Module): + def __init__(self, in_planes=256, planes=64): + super(bottleneck_int8, self).__init__() + self.conv1 = nn.Conv2d(256, 64, kernel_size=1, bias=False) + self.conv2 = nn.Conv2d( + 64, 64, kernel_size=3, padding=1, 
padding_mode="zeros", bias=False + ) + self.conv3 = nn.Conv2d(64, 256, kernel_size=1, bias=False) + + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + + def forward(self, x): + conv1_out = self.conv1(x) * inp_scale1 * weight_scale1 + relu1_out = torch.clamp( + torch.round(self.relu1(conv1_out) / inp_scale2), min, max + ) # convert to int and apply relu + conv2_out = self.conv2(relu1_out) * inp_scale2 * weight_scale2 + relu2_out = torch.clamp( + torch.round(self.relu2(conv2_out) / inp_scale3), min, max + ) + conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3 + same_scale_init = torch.clamp(torch.round(conv3_out / inp_scale1), -128, 127) + skip_add = inp_scale1 * (same_scale_init + int_inp) + final_out = inp_scale4 * ( + torch.clamp(torch.round(skip_add / inp_scale4), min, max) + ) + return final_out + + +# ------------------------------------------------------ +# Pytorch baseline +# ------------------------------------------------------ +model = bottleneck_int8() +model.eval() +model.conv1.weight.data.copy_(int_weight1) +model.conv2.weight.data.copy_(int_weight2) +model.conv3.weight.data.copy_(int_weight3) + +golden_output = model(int_inp) + +# ------------------------------------------------------ +# Reorder input data-layout +# ------------------------------------------------------ +ds = DataShaper() +before_input = int_inp.squeeze().data.numpy().astype(dtype_in) +before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") +ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") +ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + +wts1 = ds.reorder_mat(int_weight1.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") +wts2 = ds.reorder_mat(int_weight2.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") +wts3 = ds.reorder_mat(int_weight3.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + +total_wts = np.concatenate((wts1, wts2, wts3), axis=None) +total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + +# ------------------------------------------------------ +# Main run loop +# ------------------------------------------------------ +for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4 + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + +# ------------------------------------------------------ +# Reorder output data-layout +# ------------------------------------------------------ +temp_out = aie_output.reshape(32, 32, 32, 8) +temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") +ofm_mem_fmt = temp_out.reshape(256, 32, 32) +ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") +ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + +# ------------------------------------------------------ +# Compare the AIE output and the golden reference +# ------------------------------------------------------ +print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + +assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=inp_scale4, +) + +print("\nPASS!\n") diff --git a/programming_examples/ml/conv2d/CMakeLists.txt b/programming_examples/ml/conv2d/CMakeLists.txt new file mode 100644 index 0000000000..4b897cb29c --- 
/dev/null
+++ b/programming_examples/ml/conv2d/CMakeLists.txt
@@ -0,0 +1,89 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DOpenCV_DIR: Path to OpenCV install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif ()
+
+set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
+set(EDGEDETECT_HEIGHT 1080 CACHE STRING "image height")
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+find_package(OpenCV REQUIRED)
+message("opencv library path: ${OpenCV_LIB_PATH}")
+message("opencv libs: ${OpenCV_LIBS}")
+
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC
+    EDGEDETECT_WIDTH=${EDGEDETECT_WIDTH}
+    EDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT}
+    DISABLE_ABI_CHECK=1
+)
+
+target_include_directories (${currentTarget} PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils
+    ${XRT_INC_DIR}
+    ${OpenCV_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${OpenCV_LIB_PATH}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        ${OpenCV_LIBS}
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        ${OpenCV_LIBS}
+    )
+endif()
diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile
new file mode 100755
index 0000000000..0274f3fef7
--- /dev/null
+++ b/programming_examples/ml/conv2d/Makefile
@@ -0,0 +1,35 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+include ../../makefile-common
+
+mlirFileName = aieWithTrace_1core
+
+all: build/conv2dk1_i8.o build/final.xclbin
+
+
+build/${mlirFileName}.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+
+insts.txt: build/${mlirFileName}.mlir
+	aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $<
+
+build/conv2dk1_i8.o: ../../../aie_kernels/aie2/conv2dk1_i8.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
+
+build/final.xclbin: build/${mlirFileName}.mlir
+	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+
+clean:
+	rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log* *.xclbin sim \
+		chess* *.o insts.txt \
+		*.log aie_partition.json *.bin BOOT.BIN _x test.exe
+
+run_py:
+	${powershell} python3 test.py
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d/README.md b/programming_examples/ml/conv2d/README.md
new file mode 100644
index 0000000000..81b25f3e52
--- /dev/null
+++ b/programming_examples/ml/conv2d/README.md
@@ -0,0 +1,67 @@
+
+
+# Convolution 2D
+## Introduction
+Convolution is a crucial part of various machine learning and computer vision tasks, such as image recognition, object detection, and image segmentation. This README provides instructions for implementing convolution on the AI Engine.
+
+At its core, convolution is a mathematical operation that combines an input image and a filter to produce an output image. The input data is represented as a multi-dimensional matrix, such as an image with height, width, and channels (e.g., RGB channels). The filter is also represented as a multi-dimensional matrix with filter height, width, input channels (matching the input data), and output channels. The filter is systematically applied to different regions of the input data. At each step, the filter is element-wise multiplied with the overlapping region of the input data, and the element-wise products are summed up to produce a single value, which represents the result of the convolution operation for that region. This process is repeated for all possible regions of the input data, producing an output matrix called the feature map.
+
+The process of applying the filter to different regions of the input data is often visualized as a sliding window moving across the input data. The size of the sliding window corresponds to the size of the filter, and it moves with a certain stride (the number of pixels it moves at each step). The convolution operation consists of seven nested loops, iterating over the input height, input length, input channels, output channels, filter height, filter length, and the batch size, with each loop corresponding to a different aspect of the operation. This systematic process extracts features from the input image, yielding the output feature map, illustrating the computational intricacies of convolution.
+
+## Acceleration Techniques
+1. Kernel Optimization: To optimize convolution operations on AIE, we vectorize the code using AIE vector intrinsics. We load 8 elements of the input channel into vector registers using a vector load intrinsic and apply the convolution operation on this loaded data, utilizing the vector unit for enhanced computational efficiency. To ensure accurate convolution results, particularly at the edges of feature maps, we implement zero-padding to handle boundary conditions. This comprehensive approach optimizes convolution processing on AIE, facilitating efficient and accurate feature extraction in neural network applications.
The input is a 4x8 matrix corresponding to 4 row elements and 8 input channels.
+
+2. Quantization: We use int8 precision for activations and weights. At int8 precision, AIE offers the highest compute density with 256 MAC/cycle.
+
+3. Data Layout: We optimize the activation and weight layouts to enhance memory access patterns and enable effective utilization of the AIE parallel processing units, ultimately improving the performance of 2D convolution operations.
+
+## Data Layout
+We need to ensure that the data layout is compatible with efficient SIMD processing and rearrange the input data into a format where contiguous elements represent consecutive X-dimension values for each channel. For more efficient processing, we adopt a channels-last memory ordering, denoted as NYCXC8, to ensure that channels become the densest dimension. Operating on 8 elements simultaneously, we process 8 channels with the same width at once. Subsequently, we traverse the entire width dimension, handling the remaining channels in batches of 8. This process continues row-wise, resulting in our final data layout pattern: NYCXC8. This transformation ensures that data can be efficiently loaded into SIMD registers and processed in parallel.
+
+YCXC8 Input/Output Data Layout:
+
+In the YCXC8 (with N=1) data layout, the data is organized in memory as follows:
+
+* Y: Represents the feature map height (rows).
+* C: Denotes the channel groups.
+* X: Represents the feature map width (columns).
+* C8: Indicates that 8 elements of the input channel are processed together.
+
+OIYXI8O8 Weight Layout:
+
+We align the weight layout as O,I,Y,X,I8,O8 to match the input image processing. We first load the weight tensor, organizing it to match this layout, where the dimensions represent: output channels, input channels, kernel height, kernel width, input channel groups of 8, and output channel groups of 8. By aligning the weight layout in this manner, we enable seamless integration with the input data layout, maximizing parallelism and minimizing memory access overhead.
+
+In the OIYXI8O8 data layout, the data is organized in memory as follows:
+
+* O: Denotes the number of output channels.
+* I: Denotes the number of input channels.
+* Y: Represents the kernel height.
+* X: Represents the kernel width.
+* I8: Indicates that 8 elements of the input channel are processed together.
+* O8: Indicates that 8 elements of the output channel are processed together.
+
+## Compilation
+To compile the design:
+```
+make
+```
+
+To run the design:
+```
+make run_py
+```
+
+### Prerequisites
+To install the dependencies, run the following command:
+```
+pip install -r requirements.txt
+```
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py
new file mode 100644
index 0000000000..74a2c38838
--- /dev/null
+++ b/programming_examples/ml/conv2d/aie2.py
@@ -0,0 +1,263 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+ +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.dialects.ext import memref, arith +from aie.extras.context import mlir_mod_ctx + +width = 32 +height = 32 +in_channels = 64 +out_channels = 64 + +if len(sys.argv) == 3: + width = int(sys.argv[1]) + height = int(sys.argv[2]) + + +actIn = width * in_channels # 32*64 = 2048 +bufIn = actIn * 2 # double buffer +actInInt32s = actIn // 4 + +weights = in_channels * out_channels +weightsInInt32s = weights // 4 + +actOut = width * out_channels # 32*64 = 2048 +bufOut = actOut * 2 # double buffer +actOutInt32s = actOut // 4 + +enableTrace = False +trace_size = 16384 +traceSizeInInt32s = trace_size // 4 + + +def conv2dk1(): + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.ipu) + def device_body(): + + actIn_ty = T.memref(actIn, T.i8()) + bufIn_ty = T.memref(bufIn, T.i8()) + + weights_ty = T.memref(weights, T.i8()) + + out_ty = T.memref(actOut, T.i8()) + bufOut_ty = T.memref(bufOut, T.i8()) + + # memRef_3x3_ty = T.memref(3, 3, T.i16()) + + ofifo_actIn_ty = TypeAttr.get(ObjectFifoType.get(actIn_ty)) + ofifo_bufIn_ty = TypeAttr.get(ObjectFifoType.get(bufIn_ty)) + + ofifo_weights_ty = TypeAttr.get(ObjectFifoType.get(weights_ty)) + + ofifo_out_ty = TypeAttr.get(ObjectFifoType.get(out_ty)) + ofifo_bufOut_ty = TypeAttr.get(ObjectFifoType.get(bufOut_ty)) + + # AIE Core Function declarations + conv2dk1_i8 = external_func( + "conv2dk1_i8", + inputs=[ + actIn_ty, + weights_ty, + out_ty, + T.i32(), + T.i32(), + T.i32(), + T.i32(), + ], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + compute_tile2_col, compute_tile2_row = 0, 2 + + if enableTrace: + flow(ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) + + # AIE-array data movement with object fifos + # Input + of_inOF_act_L3L2 = object_fifo( + "inOF_act_L3L2", ShimTile, MemTile, 2, bufIn_ty + ) + of_act_L2_02 = object_fifo("act_L2_02", MemTile, ComputeTile2, 2, actIn_ty) + object_fifo_link(of_inOF_act_L3L2, of_act_L2_02) + + # wts + of_inOF_wts_0_L3L2 = object_fifo( + "inOF_wts_0_L3L2", ShimTile, [ComputeTile2], 1, weights_ty + ) + + # Output + of_out_02_L2 = object_fifo("out_02_L2", ComputeTile2, [MemTile], 2, out_ty) + of_outOFL2L3 = object_fifo("outOFL2L3", MemTile, [ShimTile], 2, bufOut_ty) + object_fifo_link(of_out_02_L2, of_outOFL2L3) + + # Set up compute tiles + + rtp2 = Buffer(ComputeTile2, [16], T.i32(), "rtp2") + + # Compute tile 2 + @core(ComputeTile2, "conv2dk1_i8.o") + def core_body(): + y_dim = 32 + x_dim = 32 + ci = 64 + co = 64 + + for _ in for_(0xFFFFFFFF): + elemWts = of_inOF_wts_0_L3L2.acquire(ObjectFifoPort.Consume, 1) + + scale = memref.load(rtp2, [0]) + # scale = memref.load(rtpComputeTile2, [0]) + + for _ in for_(y_dim): + elemIn = of_act_L2_02.acquire(ObjectFifoPort.Consume, 1) + elemOut0 = of_out_02_L2.acquire(ObjectFifoPort.Produce, 1) + + call( + conv2dk1_i8, + [ + elemIn, + elemWts, + elemOut0, + arith.constant(x_dim), + arith.constant(ci), + arith.constant(co), + scale, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "act_L2_02", 1) + objectfifo_release(ObjectFifoPort.Produce, "out_02_L2", 1) + yield_([]) + objectfifo_release(ObjectFifoPort.Consume, "inOF_wts_0_L3L2", 1) + yield_([]) + + # To/from AIE-array data movement + + tensorSize = width * height * in_channels + tensorSizeInInt32s = tensorSize // 4 + tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) + memRef_wts_ty = T.memref(weightsInInt32s, T.i32()) + # memRef_16x16_ty 
= T.memref(16, 16, T.i32()) + + @FuncOp.from_py_func(tensor_ty, memRef_wts_ty, tensor_ty) + def sequence(I, W, O): + if enableTrace: + # 0x340D0: Trace Control 0 + # 0xAABB---C + # AA <- Event to stop trace capture + # BB <- Event to start trace capture + # C <- Trace mode, 00=event=time, 01=event-PC, 10=execution + # Configure so that "Event 1" (always true) causes tracing to start + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x340D0, + value=0x00010000, + ) + # 0x340D4: Trace Control 1 + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x340D4, + value=0x00000000, + ) + # 0x340E0: Trace Event Group 1 (Which events to trace) + # 0xAABBCCDD AA, BB, CC, DD <- four event slots + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x340E0, + value=0x4B222125, + ) + # 0x340E4: Trace Event Group 2 (Which events to trace) + # 0xAABBCCDD AA, BB, CC, DD <- four event slots + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x340E4, + value=0x2D2C1A4F, + ) + + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x3FF00, + value=0x00000121, + ) + + # Configure a buffer descriptor to write tracing information that has been routed into this shim tile + # out to host DDR memory + trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory + output_size = bufOut + ipu_writebd_shimtile( + bd_id=trace_bd_id, + buffer_length=trace_size, + buffer_offset=output_size, + enable_packet=0, + out_of_order_id=0, + packet_id=0, + packet_type=0, + column=0, + column_num=1, + d0_size=0, + d0_stride=0, + d1_size=0, + d1_stride=0, + d2_stride=0, + ddr_id=2, + iteration_current=0, + iteration_size=0, + iteration_stride=0, + lock_acq_enable=0, + lock_acq_id=0, + lock_acq_val=0, + lock_rel_id=0, + lock_rel_val=0, + next_bd=0, + use_next_bd=0, + valid_bd=1, + ) + # Set start BD to our shim bd_Id (3) + ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id) + + IpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=10) + + ipu_dma_memcpy_nd( + metadata="inOF_act_L3L2", + bd_id=0, + mem=I, + sizes=[1, 1, 1, tensorSizeInInt32s], + ) + ipu_dma_memcpy_nd( + metadata="outOFL2L3", + bd_id=2, + mem=O, + sizes=[1, 1, 1, tensorSizeInInt32s], + ) + ipu_dma_memcpy_nd( + metadata="inOF_wts_0_L3L2", + bd_id=2, + mem=W, + sizes=[1, 1, 1, weightsInInt32s], + ) + ipu_sync(column=0, row=0, direction=0, channel=0) + + # print(ctx.module.operation.verify()) + print(ctx.module) + + +conv2dk1() diff --git a/programming_examples/ml/conv2d/requirements.txt b/programming_examples/ml/conv2d/requirements.txt new file mode 100644 index 0000000000..08ed5eeb4b --- /dev/null +++ b/programming_examples/ml/conv2d/requirements.txt @@ -0,0 +1 @@ +torch \ No newline at end of file diff --git a/programming_examples/ml/conv2d/run.lit b/programming_examples/ml/conv2d/run.lit new file mode 100644 index 0000000000..1eeef90b94 --- /dev/null +++ b/programming_examples/ml/conv2d/run.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess, torch +// +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_i8.cc -o conv2dk1_i8.o +// RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// CHECK: PASS! \ No newline at end of file diff --git a/programming_examples/ml/conv2d/test.py b/programming_examples/ml/conv2d/test.py new file mode 100644 index 0000000000..1dc847d8fe --- /dev/null +++ b/programming_examples/ml/conv2d/test.py @@ -0,0 +1,149 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. + +import torch +import torch.nn as nn +import sys +import math +from aie.utils.ml import DataShaper +import time +import os +import numpy as np +from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute + +torch.use_deterministic_algorithms(True) +torch.manual_seed(0) + +design = "conv2d" +xclbin_path = os.path.abspath("build/final.xclbin") +insts_path = os.path.abspath("build/insts.txt") + +log_folder = "log/" +if not os.path.exists(log_folder): + os.makedirs(log_folder) + +num_iter = 1 +npu_time_total = 0 +npu_time_min = 9999999 +npu_time_max = 0 +trace_size = 16384 +enable_trace = False +trace_file = "log/trace_" + design + ".txt" +# ------------------------------------------------------ +# Configure this to match your design's buffer size +# ------------------------------------------------------ +dtype_in = np.dtype("int8") +dtype_wts = np.dtype("int8") +dtype_out = np.dtype("int8") + +shape_total_wts = (4096, 1) +shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' +shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 +shape_out = (32, 8, 32, 8) + +# ------------------------------------------------------ +# Initialize activation, weights, scaling factor for int8 model +# ------------------------------------------------------ +int_inp = torch.randint(1, 20, (1, 64, 32, 32)).type(torch.FloatTensor) +int_weight = torch.randint(50, 80, (64, 64, 1, 1)).type(torch.FloatTensor) +conv_scale = 7.6294e-06 # scale to convert int8 output to floating point +int8_scale = 0.0078 # scale to convert int8 output to floating point +min = -128 +max = 127 +# ------------------------------------------------------ +# Get device, load the xclbin & kernel and register them +# ------------------------------------------------------ +app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, +) + + +# ------------------------------------------------------ +# Define your golden reference +# ------------------------------------------------------ +class conv2d_int_model(nn.Module): + def __init__(self, in_planes=64, planes=64): + super(conv2d_int_model, self).__init__() + self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) + + def forward(self, x): + out_int = self.conv(x) + out_quant = out_int * conv_scale # int8 x int8 leads to int32 output + out_float = int8_scale * torch.clamp( + torch.round(out_quant / int8_scale), min, max + ) # 
converting to int8 range + return out_float + + +# ------------------------------------------------------ +# Pytorch baseline +# ------------------------------------------------------ +model = conv2d_int_model() +model.eval() +model.conv.weight.data.copy_(int_weight) + +golden_output = model(int_inp) + +# ------------------------------------------------------ +# Reorder input data-layout +# ------------------------------------------------------ +ds = DataShaper() +before_input = int_inp.squeeze().data.numpy().astype(dtype_in) +before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") +ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") +ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + +wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") +total_wts = np.concatenate((wts1), axis=None) +total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + +# ------------------------------------------------------ +# Main run loop +# ------------------------------------------------------ +for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * int8_scale + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + +# ------------------------------------------------------ +# Reorder output data-layout +# ------------------------------------------------------ +temp_out = aie_output.reshape(32, 8, 32, 8) +temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") +ofm_mem_fmt = temp_out.reshape(64, 32, 32) +ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") +ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + +# ------------------------------------------------------ +# Compare the AIE output and the golden reference +# ------------------------------------------------------ + +print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + +assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=2 * int8_scale, +) +print("\nPASS!\n") diff --git a/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt b/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt new file mode 100644 index 0000000000..4b897cb29c --- /dev/null +++ b/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt @@ -0,0 +1,89 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. 
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DOpenCV_DIR: Path to OpenCV install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif ()
+
+set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
+set(EDGEDETECT_HEIGHT 1080 CACHE STRING "image height")
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+find_package(OpenCV REQUIRED)
+message("opencv library path: ${OpenCV_LIB_PATH}")
+message("opencv libs: ${OpenCV_LIBS}")
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC
+    EDGEDETECT_WIDTH=${EDGEDETECT_WIDTH}
+    EDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT}
+    DISABLE_ABI_CHECK=1
+)
+
+target_include_directories (${currentTarget} PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils
+    ${XRT_INC_DIR}
+    ${OpenCV_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${OpenCV_LIB_PATH}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        ${OpenCV_LIBS}
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        ${OpenCV_LIBS}
+    )
+endif()
diff --git a/programming_examples/ml/conv2d_fused_relu/Makefile b/programming_examples/ml/conv2d_fused_relu/Makefile
new file mode 100755
index 0000000000..80cb34dc08
--- /dev/null
+++ b/programming_examples/ml/conv2d_fused_relu/Makefile
@@ -0,0 +1,35 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+include ../../makefile-common
+
+mlirFileName = aieWithTrace_1core
+
+all: build/conv2dk1.o build/final.xclbin
+
+build/${mlirFileName}.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+insts.txt: build/${mlirFileName}.mlir
+	aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $<
+
+build/conv2dk1.o: ../../../aie_kernels/aie2/conv2dk1.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
+
+build/final.xclbin: build/${mlirFileName}.mlir
+	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+
+clean:
+	rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log* *.xclbin sim \
+		chess* *.o insts.txt \
+		*.log aie_partition.json *.bin BOOT.BIN _x test.exe
+
+run_py:
+	${powershell} python3 test.py
diff --git a/programming_examples/ml/conv2d_fused_relu/README.md b/programming_examples/ml/conv2d_fused_relu/README.md
new file mode 100644
index 0000000000..68e7e9b8cf
--- /dev/null
+++ b/programming_examples/ml/conv2d_fused_relu/README.md
@@ -0,0 +1,99 @@
+
+# Convolution with Fused ReLU
+
+## Introduction
+Convolution is a crucial part of various machine learning and computer vision tasks, such as image recognition, object detection, and image segmentation. ReLU (Rectified Linear Unit) is one of the most commonly used activation functions due to its simplicity and effectiveness. This README provides instructions for implementing convolution with a fused ReLU activation function on AI Engine.
+
+At its core, convolution is a mathematical operation that combines an input image and a filter to produce an output image. The input data is represented as a multi-dimensional matrix, such as an image with height, width, and channels (e.g., RGB channels). The filter is also represented as a multi-dimensional matrix with filter height, width, input channels, and output channels (its input-channel count matches the number of channels of the input data). The filter is systematically applied to different regions of the input data. At each step, the filter is element-wise multiplied with the overlapping region of the input data. The element-wise products are summed up to produce a single value, which represents the result of the convolution operation for that region. This process is repeated for all possible regions of the input data, producing an output matrix called the feature map.
+
+The process of applying the filter to different regions of the input data is often visualized as a sliding window moving across the input data. The size of the sliding window corresponds to the size of the filter, and it moves with a certain stride (the number of pixels it moves at each step). The convolution operation consists of seven nested loops, iterating over the input height, input length, input channels, output channels, filter height, filter length, and the batch size, each loop corresponding to a different aspect of the operation. This systematic process extracts features from the input image, yielding the output feature map, and illustrates the computational intricacies of convolution.
+
+## Acceleration Techniques
+1. Kernel Optimization: To optimize convolution operations on AIE, we vectorize the code using AIE vector intrinsics. We load 8 elements of the input channel into vector registers using the vector load intrinsic and apply the convolution operation on this loaded data, utilizing vector MAC operations for enhanced computational efficiency. To ensure accurate convolution results, particularly at the edges of feature maps, we implement zero-padding to handle boundary conditions. This comprehensive approach optimizes convolution processing on AIE, facilitating efficient and accurate feature extraction in neural network applications. The input is a 4x8 matrix, corresponding to 4 elements of a row and 8 input channels.
+
+2. Quantization: We use int8 precision for activations and weights. At int8 precision, AIE offers the highest compute density with 256 MAC/cycle.
+
+3. Layer Fusion: We perform two levels of fusion. First, we fuse ReLU into the convolution using the SRS (shift-round-saturate) capabilities of AIE. Second, we fuse BatchNorm into the convolution weights.
+
+4. Data Layout: We optimize the activation and weight layout to enhance memory access patterns and enable effective utilization of the AIE parallel processing units, ultimately improving the performance of 2D convolution operations.
+
+## Data Layout
+We need to ensure that the data layout is compatible with efficient SIMD processing and rearrange the input data into a format where contiguous elements represent consecutive X-dimension values for each channel. For more efficient processing, we adopt a channels-last memory ordering, denoted as NYCXC8, to ensure that channels become the densest dimension. Operating on 8 elements simultaneously, we process 8 channels with the same width at once. Subsequently, we traverse the entire width dimension, handling the remaining channels in batches of 8. This process continues row-wise, resulting in our final data layout pattern: NYCXC8. This transformation ensures that data can be efficiently loaded into SIMD registers and processed in parallel.
+
+YCXC8 Input/Output Data Layout:
+
+In the YCXC8 (with N=1) data layout, the data is organized in memory as follows:
+
+* Y: Represents the feature map height (the row dimension).
+* C: Denotes the input-channel groups (channels divided by 8).
+* X: Represents the feature map width (the column dimension).
+* C8: Indicates that 8 elements of the input channel are processed together.
+
+OIYXI8O8 Weight Layout:
+
+We align the weight layout as specified: O,I,Y,X,I8,O8, to match the input image processing. We first load the weight tensor, organizing it to match this layout, where the dimensions represent: output channels, input channels, kernel height, kernel width, input channel groups of 8, and output channel groups of 8. By aligning the weight layout in this manner, we enable seamless integration with the input data layout, maximizing parallelism and minimizing memory access overhead.
+
+In the OIYXI8O8 data layout, the data is organized in memory as follows:
+
+* O: Denotes the number of output channels.
+* I: Denotes the number of input channels.
+* Y: Represents the kernel height.
+* X: Represents the kernel width.
+* I8: Indicates that 8 elements of the input channel are processed together.
+* O8: Indicates that 8 elements of the output channel are processed together.
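+For reference, the sketch below is an illustrative numpy version of the OIYX -> OIYXI8O8 weight reordering that this example's `test.py` performs with the `DataShaper` utility (`ds.reorder_mat(..., "OIYXI8O8", "OIYX")`). The function name and transpose order are assumptions chosen to reproduce the `(8, 8, 1, 1, 8, 8)` weight shape used in `test.py`; they are not the `DataShaper` implementation itself.
+
+```
+import numpy as np
+
+def reorder_wts_oiyx_to_oiyxi8o8(wts):
+    # wts: (O, I, Y, X) -> (O/8, I/8, Y, X, I8, O8)
+    O, I, Y, X = wts.shape
+    return (wts.reshape(O // 8, 8, I // 8, 8, Y, X)
+               .transpose(0, 2, 4, 5, 3, 1)
+               .copy())
+
+wts = np.random.randint(-128, 128, size=(64, 64, 1, 1), dtype=np.int8)
+print(reorder_wts_oiyx_to_oiyxi8o8(wts).shape)  # (8, 8, 1, 1, 8, 8)
+```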
+## Fusing ReLU
+Fusing ReLU into the convolution operation can further optimize the implementation by reducing memory bandwidth requirements and computational overhead. The ReLU activation function introduces non-linearity by setting negative values to zero and leaving positive values unchanged. We use SIMD instructions to compute the ReLU activation in parallel with the convolution: after performing the convolution, the activation is applied at the vector-register level. We use `aie::set_rounding()` and `aie::set_saturation()` to set the rounding and saturation modes for the results computed in the accumulator. Setting the rounding mode to `positive_inf` rounds halfway values towards positive infinity, while setting saturation to `aie::saturation_mode::saturate` clamps the requantized results to the uint8 range (0, 255).
+
+```
+::aie::set_saturation(
+    aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+::aie::set_rounding(
+    aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+```
+After convolution and ReLU fusion, the output data is generated in the YCXC8 layout. Ensure that the output data layout is compatible with subsequent layers or processing steps in the neural network architecture.
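+As a sanity check, the numpy sketch below (illustrative only, not the AIE kernel) shows why saturating the requantized accumulator to [0, 255] is equivalent to applying ReLU first and then requantizing; the `scale` value and the round-half-up shift are assumptions that mirror the rounding and saturation behavior described above.
+
+```
+import numpy as np
+
+rng = np.random.default_rng(0)
+acc = rng.integers(-(1 << 20), 1 << 20, size=1024)  # int32-like accumulators
+scale = 9
+
+def requant(v):
+    # round to nearest (halves toward +inf) via shift, then saturate to uint8
+    return np.clip((v + (1 << (scale - 1))) >> scale, 0, 255)
+
+# clamping at 0 during requantization performs the ReLU for free
+assert np.array_equal(requant(acc), requant(np.maximum(acc, 0)))
+```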
+
+### Benefits of ReLU Fusion:
+
+1. Reduced Memory Bandwidth: By fusing ReLU into the convolution operation, unnecessary memory accesses and data transfers associated with a separate ReLU computation are eliminated, reducing memory bandwidth requirements.
+
+2. Improved Performance: Fusing ReLU reduces the number of instructions executed per element, resulting in improved computational efficiency and overall performance of the convolution operation.
+
+3. Simplified Code Structure: Fusing ReLU into the convolution kernel simplifies the code structure and reduces the overhead associated with separate activation function calls, leading to cleaner and more maintainable code.
+
+4. Enhanced Resource Utilization: By combining the convolution and ReLU operations, compute resources such as the vector units are utilized more efficiently, maximizing throughput.
+
+## Compilation
+To compile the design:
+```
+make
+```
+
+To run the design:
+```
+make run_py
+```
+
+### Prerequisites
+To install the dependencies, run the following command:
+```
+pip install -r requirements.txt
+```
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py
new file mode 100644
index 0000000000..be0167e3b4
--- /dev/null
+++ b/programming_examples/ml/conv2d_fused_relu/aie2.py
@@ -0,0 +1,263 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+ +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.dialects.ext import memref, arith +from aie.extras.context import mlir_mod_ctx + +width = 32 +height = 32 +in_channels = 64 +out_channels = 64 + +if len(sys.argv) == 3: + width = int(sys.argv[1]) + height = int(sys.argv[2]) + + +actIn = width * in_channels # 32*64 = 2048 +bufIn = actIn * 2 # double buffer +actInInt32s = actIn // 4 + +weights = in_channels * out_channels +weightsInInt32s = weights // 4 + +actOut = width * out_channels # 32*64 = 2048 +bufOut = actOut * 2 # double buffer +actOutInt32s = actOut // 4 + +enableTrace = False +trace_size = 16384 +traceSizeInInt32s = trace_size // 4 + + +def conv2dk1(): + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.ipu) + def device_body(): + + actIn_ty = T.memref(actIn, T.i8()) + bufIn_ty = T.memref(bufIn, T.i8()) + + weights_ty = T.memref(weights, T.i8()) + + out_ty = T.memref(actOut, T.ui8()) + bufOut_ty = T.memref(bufOut, T.ui8()) + + # memRef_3x3_ty = T.memref(3, 3, T.i16()) + + ofifo_actIn_ty = TypeAttr.get(ObjectFifoType.get(actIn_ty)) + ofifo_bufIn_ty = TypeAttr.get(ObjectFifoType.get(bufIn_ty)) + + ofifo_weights_ty = TypeAttr.get(ObjectFifoType.get(weights_ty)) + + ofifo_out_ty = TypeAttr.get(ObjectFifoType.get(out_ty)) + ofifo_bufOut_ty = TypeAttr.get(ObjectFifoType.get(bufOut_ty)) + + # AIE Core Function declarations + conv2dk1_i8 = external_func( + "conv2dk1_i8", + inputs=[ + actIn_ty, + weights_ty, + out_ty, + T.i32(), + T.i32(), + T.i32(), + T.i32(), + ], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + compute_tile2_col, compute_tile2_row = 0, 2 + + if enableTrace: + flow(ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) + + # AIE-array data movement with object fifos + # Input + of_inOF_act_L3L2 = object_fifo( + "inOF_act_L3L2", ShimTile, MemTile, 2, bufIn_ty + ) + of_act_L2_02 = object_fifo("act_L2_02", MemTile, ComputeTile2, 2, actIn_ty) + object_fifo_link(of_inOF_act_L3L2, of_act_L2_02) + + # wts + of_inOF_wts_0_L3L2 = object_fifo( + "inOF_wts_0_L3L2", ShimTile, [ComputeTile2], 1, weights_ty + ) + + # Output + of_out_02_L2 = object_fifo("out_02_L2", ComputeTile2, [MemTile], 2, out_ty) + of_outOFL2L3 = object_fifo("outOFL2L3", MemTile, [ShimTile], 2, bufOut_ty) + object_fifo_link(of_out_02_L2, of_outOFL2L3) + + # Set up compute tiles + + rtp2 = Buffer(ComputeTile2, [16], T.i32(), "rtp2") + + # Compute tile 2 + @core(ComputeTile2, "conv2dk1.o") + def core_body(): + y_dim = 32 + x_dim = 32 + ci = 64 + co = 64 + + for _ in for_(0xFFFFFFFF): + elemWts = of_inOF_wts_0_L3L2.acquire(ObjectFifoPort.Consume, 1) + + scale = memref.load(rtp2, [0]) + # scale = memref.load(rtpComputeTile2, [0]) + + for _ in for_(y_dim): + elemIn = of_act_L2_02.acquire(ObjectFifoPort.Consume, 1) + elemOut0 = of_out_02_L2.acquire(ObjectFifoPort.Produce, 1) + + call( + conv2dk1_i8, + [ + elemIn, + elemWts, + elemOut0, + arith.constant(x_dim), + arith.constant(ci), + arith.constant(co), + scale, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "act_L2_02", 1) + objectfifo_release(ObjectFifoPort.Produce, "out_02_L2", 1) + yield_([]) + objectfifo_release(ObjectFifoPort.Consume, "inOF_wts_0_L3L2", 1) + yield_([]) + + # To/from AIE-array data movement + + tensorSize = width * height * in_channels + tensorSizeInInt32s = tensorSize // 4 + tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) + memRef_wts_ty = T.memref(weightsInInt32s, T.i32()) + # memRef_16x16_ty = 
T.memref(16, 16, T.i32()) + + @FuncOp.from_py_func(tensor_ty, memRef_wts_ty, tensor_ty) + def sequence(I, W, O): + if enableTrace: + # 0x340D0: Trace Control 0 + # 0xAABB---C + # AA <- Event to stop trace capture + # BB <- Event to start trace capture + # C <- Trace mode, 00=event=time, 01=event-PC, 10=execution + # Configure so that "Event 1" (always true) causes tracing to start + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x340D0, + value=0x00010000, + ) + # 0x340D4: Trace Control 1 + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x340D4, + value=0x00000000, + ) + # 0x340E0: Trace Event Group 1 (Which events to trace) + # 0xAABBCCDD AA, BB, CC, DD <- four event slots + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x340E0, + value=0x4B222125, + ) + # 0x340E4: Trace Event Group 2 (Which events to trace) + # 0xAABBCCDD AA, BB, CC, DD <- four event slots + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x340E4, + value=0x2D2C1A4F, + ) + + ipu_write32( + column=compute_tile2_col, + row=compute_tile2_row, + address=0x3FF00, + value=0x00000121, + ) + + # Configure a buffer descriptor to write tracing information that has been routed into this shim tile + # out to host DDR memory + trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory + output_size = bufOut + ipu_writebd_shimtile( + bd_id=trace_bd_id, + buffer_length=trace_size, + buffer_offset=output_size, + enable_packet=0, + out_of_order_id=0, + packet_id=0, + packet_type=0, + column=0, + column_num=1, + d0_size=0, + d0_stride=0, + d1_size=0, + d1_stride=0, + d2_stride=0, + ddr_id=2, + iteration_current=0, + iteration_size=0, + iteration_stride=0, + lock_acq_enable=0, + lock_acq_id=0, + lock_acq_val=0, + lock_rel_id=0, + lock_rel_val=0, + next_bd=0, + use_next_bd=0, + valid_bd=1, + ) + # Set start BD to our shim bd_Id (3) + ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id) + + IpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=1) + + ipu_dma_memcpy_nd( + metadata="inOF_act_L3L2", + bd_id=0, + mem=I, + sizes=[1, 1, 1, tensorSizeInInt32s], + ) + ipu_dma_memcpy_nd( + metadata="outOFL2L3", + bd_id=2, + mem=O, + sizes=[1, 1, 1, tensorSizeInInt32s], + ) + ipu_dma_memcpy_nd( + metadata="inOF_wts_0_L3L2", + bd_id=2, + mem=W, + sizes=[1, 1, 1, weightsInInt32s], + ) + ipu_sync(column=0, row=0, direction=0, channel=0) + + # print(ctx.module.operation.verify()) + print(ctx.module) + + +conv2dk1() diff --git a/programming_examples/ml/conv2d_fused_relu/requirements.txt b/programming_examples/ml/conv2d_fused_relu/requirements.txt new file mode 100644 index 0000000000..08ed5eeb4b --- /dev/null +++ b/programming_examples/ml/conv2d_fused_relu/requirements.txt @@ -0,0 +1 @@ +torch \ No newline at end of file diff --git a/programming_examples/ml/conv2d_fused_relu/run.lit b/programming_examples/ml/conv2d_fused_relu/run.lit new file mode 100644 index 0000000000..0c122f451e --- /dev/null +++ b/programming_examples/ml/conv2d_fused_relu/run.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess, torch +// +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DINT8_ACT -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1.o +// RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// CHECK: PASS! \ No newline at end of file diff --git a/programming_examples/ml/conv2d_fused_relu/test.py b/programming_examples/ml/conv2d_fused_relu/test.py new file mode 100644 index 0000000000..5bfe139112 --- /dev/null +++ b/programming_examples/ml/conv2d_fused_relu/test.py @@ -0,0 +1,151 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. + +import torch +import torch.nn as nn +import sys +import math +from aie.utils.ml import DataShaper +import time +import os +import numpy as np +from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute + +torch.use_deterministic_algorithms(True) +torch.manual_seed(0) + +design = "conv2d_with_relu" +xclbin_path = os.path.abspath("build/final.xclbin") +insts_path = os.path.abspath("build/insts.txt") + +log_folder = "log/" +if not os.path.exists(log_folder): + os.makedirs(log_folder) + +num_iter = 1 +npu_time_total = 0 +npu_time_min = 9999999 +npu_time_max = 0 +trace_size = 16384 +enable_trace = False +trace_file = "log/trace_" + design + ".txt" +# ------------------------------------------------------ +# Configure this to match your design's buffer size +# ------------------------------------------------------ +dtype_in = np.dtype("int8") +dtype_wts = np.dtype("int8") +dtype_out = np.dtype("uint8") + +shape_total_wts = (4096, 1) +shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' +shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 +shape_out = (32, 8, 32, 8) + +# ------------------------------------------------------ +# Initialize activation, weights, scaling factor for int8 model +# ------------------------------------------------------ +int_inp = torch.randint(1, 100, (1, 64, 32, 32)).type(torch.FloatTensor) +int_weight = torch.randint(50, 100, (64, 64, 1, 1)).type(torch.FloatTensor) +conv_scale = 0.0039 # scale to convert int8 output to floating point +relu_scale = 0.0078 # scale to convert int8 output to floating point +min = 0 +max = 255 + +# ------------------------------------------------------ +# Get device, load the xclbin & kernel and register them +# ------------------------------------------------------ +app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, +) + + +# ------------------------------------------------------ +# Define your golden reference +# ------------------------------------------------------ +class conv2d_relu_int_model(nn.Module): + def __init__(self, in_planes=64, planes=64): + super(conv2d_relu_int_model, self).__init__() + self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + out_int = self.conv(x) + out_float = out_int * conv_scale + out_int = self.relu(out_float) + out_float = relu_scale * 
torch.clamp( + torch.round(out_int / relu_scale), min, max + ) # converting to int to do proper clipping + return out_float + + +# ------------------------------------------------------ +# Pytorch baseline +# ------------------------------------------------------ +model = conv2d_relu_int_model() +model.eval() +model.conv.weight.data.copy_(int_weight) +golden_output = model(int_inp) + +# ------------------------------------------------------ +# Reorder input data-layout +# ------------------------------------------------------ +ds = DataShaper() +before_input = int_inp.squeeze().data.numpy().astype(dtype_in) +before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") +ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") +ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + +wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") +total_wts = np.concatenate((wts1), axis=None) +total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + +# ------------------------------------------------------ +# Main run loop +# ------------------------------------------------------ +for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * relu_scale + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + +# ------------------------------------------------------ +# Reorder output data-layout +# ------------------------------------------------------ +temp_out = aie_output.reshape(32, 8, 32, 8) +temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") +ofm_mem_fmt = temp_out.reshape(64, 32, 32) +ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") +ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + +# ------------------------------------------------------ +# Compare the AIE output and the golden reference +# ------------------------------------------------------ +print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + +assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=2 * relu_scale, +) + +print("\nPASS!\n") diff --git a/programming_examples/ml/resnet/README.md b/programming_examples/ml/resnet/README.md new file mode 100755 index 0000000000..6382079c62 --- /dev/null +++ b/programming_examples/ml/resnet/README.md @@ -0,0 +1,121 @@ + + +# ResNet with Offloaded Conv2_x Bottleneck Blocks + +## Introduction +ResNet [[1]](#1) is a convolutional neural network architecture that has gained significant popularity for various computer vision tasks, including image classification, object detection, and image segmentation. It is renowned for its depth and efficiency in training very deep networks. + +This README focuses on a specific optimization technique applied to ResNet, specifically targeting the offloading of the conv2_x part of the bottleneck blocks. By offloading computations to dedicated hardware accelerators or specialized processors, we aim to improve the overall efficiency and speed of the network, especially when deploying it on resource-constrained devices or in scenarios where real-time processing is critical. + + +## ResNet Architecture Overview +ResNet consists of several key components: + +1. 
Input Layer: Accepts input image data with dimensions typically set to 224x224x3 (width, height, RGB channels).
+2. Convolutional Layers: The initial layers perform convolution operations to extract basic features from the input image.
+3. Bottleneck Blocks:
+    * ResNet is composed of multiple bottleneck blocks grouped into different stages (conv2_x, conv3_x, conv4_x, conv5_x).
+    * Each bottleneck block contains convolutional layers and shortcut connections that facilitate the learning of residual mappings.
+    * The conv2_x stage is the stage targeted for offloading in this optimization.
+4. Pooling Layers: Max pooling layers reduce the spatial dimensions of the feature maps.
+5. Fully Connected Layer: Produces the final output predictions, typically followed by a softmax activation for classification tasks.
+
+## Offloading Conv2_x Bottleneck Blocks
+The conv2_x stage of ResNet comprises a series of bottleneck blocks, each containing convolutional layers responsible for learning more complex features from the input data. By offloading the computations within these blocks to the AI Engine, we aim to:
+
+* Reduce the computational burden on the main processing unit (e.g., CPU or GPU).
+* Improve overall inference speed and efficiency, especially in scenarios where real-time processing is crucial.
+* Enable deployment on resource-constrained devices with limited computational resources.
+
+## Usage and Deployment
+To leverage the optimized ResNet with offloaded conv2_x bottleneck blocks:
+* [IRON Programming](https://github.com/Xilinx/mlir-aie/tree/gagan_asplos_resnet/programming_examples/ml/resnet/layers_conv2_x): Demonstrates the IRON flow for offloading conv2_x to AIE.
+
+## Acceleration Techniques
+1. Depth-First/Layer-Fused Implementation: Spatial architectures provide coarse-grained flexibility that allows the dataflow to be tailored to optimize data movement. By tailoring the dataflow, we implement a depth-first schedule for a bottleneck block, routing the output of one convolutional operation on an AIE core directly to another convolutional operation on a separate AIE core, without the need to transfer intermediate results off-chip. This approach effectively minimizes the memory footprint associated with intermediate data, mitigating the overhead of costly off-chip accesses and increasing overall performance.
+
+2. Data Layout: We optimize the activation and weight layout to enhance memory access patterns and enable effective utilization of the AIE parallel processing units, ultimately improving the performance of 2D convolution operations.
+
+3. Kernel Optimization: To optimize convolution operations on AIE, we vectorize the code using AIE vector intrinsics. We load 8 elements of the input channel into vector registers using the vector load intrinsic and apply the convolution operation on this loaded data, utilizing vector MAC operations for enhanced computational efficiency. To ensure accurate convolution results, particularly at the edges of feature maps, we implement zero-padding to handle boundary conditions. The input is a 4x8 matrix, corresponding to 4 elements of a row and 8 input channels.
+
+4. Quantization: We use int8 precision for activations and weights. At int8 precision, AIE offers the highest compute density with 256 MAC/cycle.
+
+5. Layer Fusion: We perform two levels of fusion. First, we fuse ReLU into the convolution using the SRS (shift-round-saturate) capabilities of AIE. Second, we fuse BatchNorm into the convolution weights.
+
+## Data Layout
+We need to ensure that the data layout is compatible with efficient SIMD processing and rearrange the input data into a format where contiguous elements represent consecutive X-dimension values for each channel. For more efficient processing, we adopt a channels-last memory ordering, denoted as NYCXC8, to ensure that channels become the densest dimension. Operating on 8 elements simultaneously, we process 8 channels with the same width at once. Subsequently, we traverse the entire width dimension, handling the remaining channels in batches of 8. This process continues row-wise, resulting in our final data layout pattern: NYCXC8. This transformation ensures that data can be efficiently loaded into SIMD registers and processed in parallel.
+
+YCXC8 Input/Output Data Layout:
+
+In the YCXC8 (with N=1) data layout, the data is organized in memory as follows:
+
+* Y: Represents the feature map height (the row dimension).
+* C: Denotes the input-channel groups (channels divided by 8).
+* X: Represents the feature map width (the column dimension).
+* C8: Indicates that 8 elements of the input channel are processed together.
+
+OIYXI8O8 Weight Layout:
+
+We align the weight layout as specified: O,I,Y,X,I8,O8, to match the input image processing. We first load the weight tensor, organizing it to match this layout, where the dimensions represent: output channels, input channels, kernel height, kernel width, input channel groups of 8, and output channel groups of 8. By aligning the weight layout in this manner, we enable seamless integration with the input data layout, maximizing parallelism and minimizing memory access overhead.
+
+In the OIYXI8O8 data layout, the data is organized in memory as follows:
+
+* O: Denotes the number of output channels.
+* I: Denotes the number of input channels.
+* Y: Represents the kernel height.
+* X: Represents the kernel width.
+* I8: Indicates that 8 elements of the input channel are processed together.
+* O8: Indicates that 8 elements of the output channel are processed together.
+
+## Fusing Convolution and Batch Normalization
+
+We assume the BatchNorm layer is fused into the convolution layer. Fusing BatchNorm into convolution involves incorporating the normalization step directly into the convolution operation. This is achieved by modifying the weights of the convolutional filters to include the BatchNorm scaling and shifting factors, so that the convolution performs the normalization, scaling, and shifting in a single step.
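+The sketch below illustrates this weight adjustment in PyTorch under stated assumptions: a 1x1 convolution followed by `BatchNorm2d` in eval mode, with randomly chosen running statistics. The helper name `fuse_conv_bn` is hypothetical; this is a minimal reference for the folding described above, not the code used in this design.
+
+```
+import torch
+import torch.nn as nn
+
+def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
+    fused = nn.Conv2d(conv.in_channels, conv.out_channels,
+                      conv.kernel_size, conv.stride, conv.padding, bias=True)
+    scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)  # gamma / sqrt(var + eps)
+    fused.weight.data = conv.weight.data * scale.reshape(-1, 1, 1, 1)
+    bias = conv.bias.data if conv.bias is not None else torch.zeros(conv.out_channels)
+    fused.bias.data = bn.bias.data + (bias - bn.running_mean) * scale
+    return fused
+
+conv = nn.Conv2d(64, 64, kernel_size=1, bias=False)
+bn = nn.BatchNorm2d(64).eval()
+bn.running_mean.uniform_(-1.0, 1.0)  # pretend these were learned
+bn.running_var.uniform_(0.5, 1.5)
+x = torch.randn(1, 64, 32, 32)
+assert torch.allclose(fuse_conv_bn(conv, bn)(x), bn(conv(x)), atol=1e-5)
+```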
+## Fusing ReLU
+
+Fusing ReLU into the convolution operation can further optimize the implementation by reducing memory bandwidth requirements and computational overhead. The ReLU activation function introduces non-linearity by setting negative values to zero and leaving positive values unchanged. We use SIMD instructions to compute the ReLU activation in parallel with the convolution: after performing the convolution, the activation is applied at the vector-register level. We use `aie::set_rounding()` and `aie::set_saturation()` to set the rounding and saturation modes for the results computed in the accumulator. Setting the rounding mode to `positive_inf` rounds halfway values towards positive infinity, while setting saturation to `aie::saturation_mode::saturate` clamps the requantized results to the uint8 range (0, 255).
+
+```
+::aie::set_saturation(
+    aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+::aie::set_rounding(
+    aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+```
+After convolution and ReLU fusion, the output data is generated in the YCXC8 layout. Ensure that the output data layout is compatible with subsequent layers or processing steps in the neural network architecture.
+
+## Compilation
+To compile the design:
+```
+make
+```
+
+To run the design:
+```
+make run_py
+```
+
+### Prerequisites
+To install the dependencies, run the following command:
+```
+pip install -r requirements.txt
+```
+
+## References
+<a id="1">[1]</a> He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 770-778).
diff --git a/programming_examples/ml/resnet/layers_conv2_x/CMakeLists.txt b/programming_examples/ml/resnet/layers_conv2_x/CMakeLists.txt
new file mode 100755
index 0000000000..4b897cb29c
--- /dev/null
+++ b/programming_examples/ml/resnet/layers_conv2_x/CMakeLists.txt
@@ -0,0 +1,89 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DOpenCV_DIR: Path to OpenCV install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif ()
+
+set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
+set(EDGEDETECT_HEIGHT 1080 CACHE STRING "image height")
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+find_package(OpenCV REQUIRED)
+message("opencv library path: ${OpenCV_LIB_PATH}")
+message("opencv libs: ${OpenCV_LIBS}")
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC
+    EDGEDETECT_WIDTH=${EDGEDETECT_WIDTH}
+    EDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT}
+    DISABLE_ABI_CHECK=1
+)
+target_include_directories (${currentTarget} PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils + ${XRT_INC_DIR} + ${OpenCV_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS} +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${OpenCV_LIB_PATH} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ${OpenCV_LIBS} + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ${OpenCV_LIBS} + ) +endif() diff --git a/programming_examples/ml/resnet/layers_conv2_x/Makefile b/programming_examples/ml/resnet/layers_conv2_x/Makefile new file mode 100755 index 0000000000..2f978a05ba --- /dev/null +++ b/programming_examples/ml/resnet/layers_conv2_x/Makefile @@ -0,0 +1,50 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../../../makefile-common + +mlirFileName = aie + +all: build/conv2dk1_i8.o build/conv2dk1_skip_init.o build/conv2dk3.o build/conv2dk1_skip.o build/conv2dk1_ui8.o build/final.xclbin + +# build/${mlirFileName}.mlir: aie2.py +# mkdir -p ${@D} +# python3 $< > $@ + +build/${mlirFileName}.mlir: aie.mlir + mkdir -p ${@D} + cp $< $@ +insts.txt: build/${mlirFileName}.mlir + aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $< + +build/conv2dk1_i8.o: ../../../../aie_kernels/aie2/conv2dk1.cc + xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ + +build/conv2dk3.o: ../../../../aie_kernels/aie2/conv2dk3.cc + xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@ + +build/conv2dk1_skip_init.o: ../../../../aie_kernels/aie2/conv2dk1_skip_init.cc + xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ + +build/conv2dk1_ui8.o: ../../../../aie_kernels/aie2/conv2dk1.cc + xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@ + +build/conv2dk1_skip.o: ../../../../aie_kernels/aie2/conv2dk1_skip.cc + xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@ + +build/final.xclbin: build/${mlirFileName}.mlir + cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + +clean: + rm -rf build *.elf* *.lst *.bif log* ${mlirFileName}.mlir.prj *.xclbin sim \ + chess* *.o insts.txt \ + *.log aie_partition.json *.bin BOOT.BIN _x test.exe + +run_py: + ${powershell} python3 test.py diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir new file mode 100755 index 0000000000..ccc04efb9a --- /dev/null +++ b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir @@ -0,0 +1,1014 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +module { +aie.device(ipu) { + + //shim + %tile00 = aie.tile(0, 0) + %tile10 = aie.tile(1, 0) + %tile20 = aie.tile(2, 0) + + //memtiles + %tile01 = aie.tile(0, 1) + %tile11 = aie.tile(1, 1) + %tile21 = aie.tile(2, 1) + + %tile02 = aie.tile(0, 2) + %tile03 = aie.tile(0, 3) + %tile04 = aie.tile(0, 5) + %tile05 = aie.tile(0, 4) + + %tile12 = aie.tile(1, 2) + %tile13 = aie.tile(1, 3) + %tile14 = aie.tile(1, 4) + %tile15 = aie.tile(1, 5) + + %tile22 = aie.tile(2, 2) + %tile23 = aie.tile(2, 3) + %tile24 = aie.tile(2, 4) + %tile25 = aie.tile(2, 5) + //Trace: add flow + aie.flow(%tile24, "Trace" : 0, %tile00, "DMA" : 1) + + %rtp2 = aie.buffer(%tile02) {sym_name = "rtp2"} : memref<16xi32> + %rtp3 = aie.buffer(%tile03) {sym_name = "rtp3"} : memref<16xi32> + %rtp4 = aie.buffer(%tile04) {sym_name = "rtp4"} : memref<16xi32> + %rtp5 = aie.buffer(%tile05) {sym_name = "rtp5"} : memref<16xi32> + + %rtp12 = aie.buffer(%tile12) {sym_name = "rtp12"} : memref<16xi32> + %rtp13 = aie.buffer(%tile13) {sym_name = "rtp13"} : memref<16xi32> + %rtp14 = aie.buffer(%tile14) {sym_name = "rtp14"} : memref<16xi32> + %rtp15 = aie.buffer(%tile15) {sym_name = "rtp15"} : memref<16xi32> + + %rtp22 = aie.buffer(%tile22) {sym_name = "rtp22"} : memref<16xi32> + %rtp23 = aie.buffer(%tile23) {sym_name = "rtp23"} : memref<16xi32> + %rtp24 = aie.buffer(%tile24) {sym_name = "rtp24"} : memref<16xi32> + %rtp25 = aie.buffer(%tile25) {sym_name = "rtp25"} : memref<16xi32> + + // ___________________________Bottleneck 1___________________________ + //initial activation for 1x1 + aie.objectfifo @inOF_act_L3L2(%tile00, {%tile02,%tile01},[2,2,4]): !aie.objectfifo> // from shim broadcast to core2 and memtile + aie.objectfifo @skip_buf(%tile01, {%tile05}, 2: i32): !aie.objectfifo> // link the skip buffer in memtile to conv1_skip in tile4 + aie.objectfifo.link[@inOF_act_L3L2]-> [@skip_buf] () + + //wts + aie.objectfifo @inOF_wts_0_L3L2(%tile00, {%tile01}, 1 : i32) : !aie.objectfifo> // total buffer for weights + aie.objectfifo @wts_buf_00(%tile01, {%tile02}, 1 : i32) : !aie.objectfifo> // L1 buffer for first conv1x1 weights 256x64x1x1= 16384 + aie.objectfifo @wts_buf_01(%tile01, {%tile03,%tile04}, 1 : i32) : !aie.objectfifo> // L1 buffer for middle conv3x3 weights 64x64x3x3= 36864 + aie.objectfifo @wts_buf_02(%tile01, {%tile05}, 1 : i32) : !aie.objectfifo> // L1 buffer for final conv1x1 weights 64x256x1x1= 16384 + aie.objectfifo.link[@inOF_wts_0_L3L2]-> [@wts_buf_00,@wts_buf_01,@wts_buf_02] () + + // OF for intermediate ofm between 1x1 and 3x3 + aie.objectfifo @act_2_3_4(%tile02, {%tile03,%tile04}, 4 : i32) : !aie.objectfifo> //32x1x32 + // OF for intermediate ofm between 3x3 and 1x1 + aie.objectfifo @act_3_5(%tile03, {%tile05}, 2 : i32) : !aie.objectfifo> //32x1x32 + aie.objectfifo @act_4_5(%tile04, {%tile05}, 2 : i32) : !aie.objectfifo> //32x1x32 + + // ___________________________Bottleneck 2___________________________ + //wts + aie.objectfifo @inOF_wts_1_L3L2(%tile10, {%tile11}, 1 : i32) : !aie.objectfifo> // total buffer for weights + aie.objectfifo @wts_buf_10(%tile11, {%tile15}, 1 : i32) : !aie.objectfifo> // L1 buffer for first conv1x1 weights 256x64x1x1= 16384 + aie.objectfifo @wts_buf_11(%tile11, {%tile12,%tile14}, 1 : i32) : !aie.objectfifo> // L1 buffer for middle conv3x3 weights 64x64x3x3= 36864 + aie.objectfifo @wts_buf_12(%tile11, {%tile13}, 1 : i32) : !aie.objectfifo> // L1 buffer for 
final conv1x1 weights 64x256x1x1= 16384 + aie.objectfifo.link[@inOF_wts_1_L3L2]-> [@wts_buf_10,@wts_buf_11,@wts_buf_12] () + + //initial activation for 1x1 + aie.objectfifo @act_05_15(%tile05, {%tile15,%tile01},[2,2,4]): !aie.objectfifo> // from shim broadcast to core2 and memtile + aie.objectfifo @skip_buf2(%tile01, {%tile13}, 2: i32): !aie.objectfifo> // link the skip buffer in memtile to conv1_skip in tile4 + aie.objectfifo.link[@act_05_15]-> [@skip_buf2] () + + // OF for intermediate ofm between 1x1 and 3x3 + aie.objectfifo @act_15_12_14(%tile15, {%tile12,%tile14}, 4 : i32) : !aie.objectfifo> //32x1x32 + + // OF for intermediate ofm between 3x3 and 1x1 + aie.objectfifo @act_12_13(%tile12, {%tile13}, 2 : i32) : !aie.objectfifo> //32x1x32 + aie.objectfifo @act_14_13(%tile14, {%tile13}, 2 : i32) : !aie.objectfifo> //32x1x32 + + + // ___________________________Bottleneck 3___________________________ + //wts + aie.objectfifo @inOF_wts_2_L3L2(%tile20, {%tile21}, 1 : i32) : !aie.objectfifo> // total buffer for weights + aie.objectfifo @wts_buf_20(%tile21, {%tile22}, 1 : i32) : !aie.objectfifo> // L1 buffer for first conv1x1 weights 256x64x1x1= 16384 + aie.objectfifo @wts_buf_21(%tile21, {%tile23,%tile25}, 1 : i32) : !aie.objectfifo> // L1 buffer for middle conv3x3 weights 64x64x3x3= 36864 + aie.objectfifo @wts_buf_22(%tile21, {%tile24}, 1 : i32) : !aie.objectfifo> // L1 buffer for final conv1x1 weights 64x256x1x1= 16384 + aie.objectfifo.link[@inOF_wts_2_L3L2]-> [@wts_buf_20,@wts_buf_21,@wts_buf_22] () + + //initial activation for 1x1 + aie.objectfifo @act_13_22(%tile13, {%tile22,%tile21},[2,2,4]): !aie.objectfifo> // from shim broadcast to core2 and memtile + aie.objectfifo @skip_buf3(%tile21, {%tile24}, 2: i32): !aie.objectfifo> // link the skip buffer in memtile to conv1_skip in tile4 + aie.objectfifo.link[@act_13_22]-> [@skip_buf3] () + + // OF for intermediate ofm between 1x1 and 3x3 + aie.objectfifo @act_22_23_25(%tile22, {%tile23,%tile25}, 4 : i32) : !aie.objectfifo> //32x1x32 + + // OF for intermediate ofm between 3x3 and 1x1 + aie.objectfifo @act_23_24(%tile23, {%tile24}, 2 : i32) : !aie.objectfifo> //32x1x32 + aie.objectfifo @act_25_24(%tile25, {%tile24}, 2 : i32) : !aie.objectfifo> //32x1x32 + + // Final output OF + aie.objectfifo @outOFL2L3(%tile24, {%tile10}, 2 : i32) : !aie.objectfifo> //32x1x64 + + // ___________________________Kernel Call___________________________ + func.func private @conv2dk1_i8(memref<32x1x64xi8>, memref<4096xi8>, memref<32x1x64xui8>,i32,i32,i32,i32) -> () + func.func private @conv2dk3_ui8(memref<32x1x64xui8>,memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + func.func private @conv2dk1_skip_init_i8(memref<32x1x32xui8>,memref<32x1x32xui8>, memref<32768xi8>,memref<32x1x256xui8>,memref<32x1x64xi8>,i32,i32,i32,i32,i32,i32,i32) -> () + + func.func private @conv2dk1_ui8(memref<32x1x256xui8>, memref<16384xi8>, memref<32x1x64xui8>,i32,i32,i32,i32) -> () + func.func private @conv2dk1_skip_ui8(memref<32x1x32xui8>,memref<32x1x32xui8>, memref<16384xi8>,memref<32x1x256xui8>,memref<32x1x256xui8>,i32,i32,i32,i32,i32) -> () + // ___________________________Bottleneck 1___________________________ + // 1x1 conv + aie.core(%tile02) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %x_dim = arith.constant 32 : i32 + %y_dim = arith.constant 32 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 64 : i32 + + %intmax = arith.constant 0xFFFFFFFF : index + scf.for %arg3 = %c0 to 
%intmax step %c1 { + // acquire wts once + %subviewWts = aie.objectfifo.acquire @wts_buf_00(Consume, 1) : !aie.objectfifosubview<memref<4096xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<4096xi8>> -> memref<4096xi8> + %scale = memref.load %rtp2[%c0] : memref<16xi32> + + scf.for %n = %c0 to %y_dim step %c1 { + %subviewIn = aie.objectfifo.acquire @inOF_act_L3L2(Consume, 1) : !aie.objectfifosubview<memref<32x1x64xi8>> + %elemIn = aie.objectfifo.subview.access %subviewIn[0] : !aie.objectfifosubview<memref<32x1x64xi8>> -> memref<32x1x64xi8> + + %subviewOut = aie.objectfifo.acquire @act_2_3_4(Produce, 1) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemOut0 = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + + func.call @conv2dk1_i8(%elemIn,%elemWts, %elemOut0,%x_dim,%ci,%co,%scale) : (memref<32x1x64xi8>,memref<4096xi8>, memref<32x1x64xui8>,i32,i32,i32,i32) -> () + + aie.objectfifo.release @inOF_act_L3L2(Consume, 1) + aie.objectfifo.release @act_2_3_4(Produce, 1) + + } + aie.objectfifo.release @wts_buf_00(Consume, 1) + } + aie.end + } { link_with="conv2dk1_i8.o" }
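+ + // The 3x3 cores below stream the image row by row. The preamble passes the + // first row twice together with the %top flag, the steady-state loop slides a + // three-row window with %middle, and the postamble passes the last row twice + // with %bottom, so the kernel knows which border case it is computing.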
+ + // 3x3 conv + aie.core(%tile03) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %x_dim = arith.constant 32 : i32 + %y_dim_minus_2 = arith.constant 30 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 32 : i32 + + %kx_dim = arith.constant 3 : i32 + %ky_dim = arith.constant 3 : i32 + + %top = arith.constant 0 : i32 + %middle = arith.constant 1 : i32 + %bottom = arith.constant 2 : i32 + + %co_offset = arith.constant 0 : i32 + // acquire wts once + // %subviewWts = aie.objectfifo.acquire(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) : !aie.objectfifosubview<memref<73728xi8>> + // %scale = memref.load %rtp3[%c0] : memref<16xi32> + + %scale = arith.constant 1 : i32 + %intmax = arith.constant 0xFFFFFFFF : index + scf.for %arg3 = %c0 to %intmax step %c1 { + %subviewWts = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8> + + // Preamble : Top Border + + %subviewIn = aie.objectfifo.acquire @act_2_3_4(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn0 = aie.objectfifo.subview.access %subviewIn[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1 = aie.objectfifo.subview.access %subviewIn[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut = aie.objectfifo.acquire @act_3_5(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + + func.call @conv2dk3_ui8(%elemIn0,%elemIn0,%elemIn1,%elemWts, %elemOut,%x_dim,%ci,%co,%kx_dim,%ky_dim,%top,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_3_5(Produce, 1) + + // Middle + scf.for %n = %c0 to %y_dim_minus_2 step %c1 { + %subviewIn1 = aie.objectfifo.acquire @act_2_3_4(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn1_0 = aie.objectfifo.subview.access %subviewIn1[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_1 = aie.objectfifo.subview.access %subviewIn1[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_2 = aie.objectfifo.subview.access %subviewIn1[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut1 = aie.objectfifo.acquire @act_3_5(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut1 = aie.objectfifo.subview.access %subviewOut1[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + func.call @conv2dk3_ui8(%elemIn1_0,%elemIn1_1,%elemIn1_2,%elemWts, %elemOut1,%x_dim,%ci,%co,%kx_dim,%ky_dim,%middle,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_3_5(Produce, 1) + aie.objectfifo.release @act_2_3_4(Consume, 1) + + } + // Postamble : Bottom Border + %subviewIn2 = aie.objectfifo.acquire @act_2_3_4(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn2_0 = aie.objectfifo.subview.access %subviewIn2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn2_1 = aie.objectfifo.subview.access %subviewIn2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut2 = aie.objectfifo.acquire @act_3_5(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut2 = aie.objectfifo.subview.access %subviewOut2[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + func.call @conv2dk3_ui8(%elemIn2_0,%elemIn2_1,%elemIn2_1,%elemWts, %elemOut2,%x_dim,%ci,%co,%kx_dim,%ky_dim,%bottom,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_3_5(Produce, 1) + aie.objectfifo.release @act_2_3_4(Consume, 2) + + //release weights + aie.objectfifo.release @wts_buf_01(Consume, 1) + } + // aie.objectfifo.release(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) + aie.end + } { link_with="conv2dk3.o" } + + // 3x3 conv + aie.core(%tile04) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %x_dim = arith.constant 32 : i32 + %y_dim_minus_2 = arith.constant 30 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 32 : i32 + + %kx_dim = arith.constant 3 : i32 + %ky_dim = arith.constant 3 : i32 + + %top = arith.constant 0 : i32 + %middle = arith.constant 1 : i32 + %bottom = arith.constant 2 : i32 + + %co_offset = arith.constant 32 : i32 + %intmax = arith.constant 0xFFFFFFFF : index + // %scale = memref.load %rtp4[%c0] : memref<16xi32> + %scale = arith.constant 1 : i32 + scf.for %arg3 = %c0 to %intmax step %c1 { + // acquire wts once + // %subviewWts = aie.objectfifo.acquire(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) : !aie.objectfifosubview<memref<73728xi8>> + %subviewWts = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8> + + // Preamble : Top Border + + %subviewIn = aie.objectfifo.acquire @act_2_3_4(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn0 = aie.objectfifo.subview.access %subviewIn[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1 = aie.objectfifo.subview.access %subviewIn[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut = aie.objectfifo.acquire @act_4_5(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + func.call @conv2dk3_ui8(%elemIn0,%elemIn0,%elemIn1,%elemWts, %elemOut,%x_dim,%ci,%co,%kx_dim,%ky_dim,%top,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_4_5(Produce, 1) + + // Middle + scf.for %n = %c0 to %y_dim_minus_2 step %c1 { + %subviewIn1 = aie.objectfifo.acquire @act_2_3_4(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn1_0 = aie.objectfifo.subview.access 
%subviewIn1[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_1 = aie.objectfifo.subview.access %subviewIn1[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_2 = aie.objectfifo.subview.access %subviewIn1[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut1 = aie.objectfifo.acquire @act_4_5(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut1 = aie.objectfifo.subview.access %subviewOut1[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + func.call @conv2dk3_ui8(%elemIn1_0,%elemIn1_1,%elemIn1_2,%elemWts, %elemOut1,%x_dim,%ci,%co,%kx_dim,%ky_dim,%middle,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_4_5(Produce, 1) + aie.objectfifo.release @act_2_3_4(Consume, 1) + + } + // Postamble : Bottom Border + %subviewIn2 = aie.objectfifo.acquire @act_2_3_4(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn2_0 = aie.objectfifo.subview.access %subviewIn2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn2_1 = aie.objectfifo.subview.access %subviewIn2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut2 = aie.objectfifo.acquire @act_4_5(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut2 = aie.objectfifo.subview.access %subviewOut2[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + func.call @conv2dk3_ui8(%elemIn2_0,%elemIn2_1,%elemIn2_1,%elemWts, %elemOut2,%x_dim,%ci,%co,%kx_dim,%ky_dim,%bottom,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_4_5(Produce, 1) + aie.objectfifo.release @act_2_3_4(Consume, 2) + + //release weights + aie.objectfifo.release @wts_buf_01(Consume, 1) + // aie.objectfifo.release(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) + } + aie.end + + } { link_with="conv2dk3.o" } + + // 1x1 conv with skip + aie.core(%tile05) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + %x_dim = arith.constant 32 : i32 + %y_dim = arith.constant 32 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 256 : i32 + %ci_skip = arith.constant 64 : i32 + + %intmax = arith.constant 0xFFFFFFFF : index + scf.for %arg3 = %c0 to %intmax step %c1 { + // acquire wts once + %subviewWts = aie.objectfifo.acquire @wts_buf_02(Consume, 1) : !aie.objectfifosubview<memref<32768xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<32768xi8>> -> memref<32768xi8> + + %scale = memref.load %rtp5[%c0] : memref<16xi32> + %skip_scale = memref.load %rtp5[%c1] : memref<16xi32> + %skip_conv_scale = memref.load %rtp5[%c2] : memref<16xi32> + + // %skip_scale = arith.constant 0 : i32 + scf.for %n = %c0 to %y_dim step %c1 { + %subviewIn0 = aie.objectfifo.acquire @act_3_5(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemIn0 = aie.objectfifo.subview.access %subviewIn0[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + %subviewIn1 = aie.objectfifo.acquire @act_4_5(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemIn1 = aie.objectfifo.subview.access %subviewIn1[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + %subviewOut = aie.objectfifo.acquire @act_05_15(Produce, 1) : !aie.objectfifosubview<memref<32x1x256xui8>> + %elemOut0 = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8> + + %subviewSkip = aie.objectfifo.acquire @skip_buf(Consume, 1) : !aie.objectfifosubview<memref<32x1x64xi8>> + %elemSkip = 
aie.objectfifo.subview.access %subviewSkip[0] : !aie.objectfifosubview<memref<32x1x64xi8>> -> memref<32x1x64xi8> + + + // %skip_scale = arith.constant 0 : i32 + func.call @conv2dk1_skip_init_i8(%elemIn0,%elemIn1,%elemWts, %elemOut0,%elemSkip,%x_dim,%ci,%co,%ci_skip,%scale,%skip_scale,%skip_conv_scale) : (memref<32x1x32xui8>,memref<32x1x32xui8>, memref<32768xi8>,memref<32x1x256xui8>,memref<32x1x64xi8>,i32,i32,i32,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_05_15(Produce, 1) + aie.objectfifo.release @act_3_5(Consume, 1) + aie.objectfifo.release @act_4_5(Consume, 1) + aie.objectfifo.release @skip_buf(Consume, 1) + + } + aie.objectfifo.release @wts_buf_02(Consume, 1) + } + aie.end + } { link_with="conv2dk1_skip_init.o" } + // ___________________________Bottleneck 2___________________________ + // 1x1 conv + aie.core(%tile15) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %x_dim = arith.constant 32 : i32 + %y_dim = arith.constant 32 : index + + %ci = arith.constant 256 : i32 + %co = arith.constant 64 : i32 + + %intmax = arith.constant 0xFFFFFFFF : index + scf.for %arg3 = %c0 to %intmax step %c1 { + // acquire wts once + %subviewWts = aie.objectfifo.acquire @wts_buf_10(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8> + %scale = memref.load %rtp15[%c0] : memref<16xi32> + + scf.for %n = %c0 to %y_dim step %c1 { + %subviewIn = aie.objectfifo.acquire @act_05_15(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xui8>> + %elemIn = aie.objectfifo.subview.access %subviewIn[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8> + + %subviewOut = aie.objectfifo.acquire @act_15_12_14(Produce, 1) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemOut0 = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + + func.call @conv2dk1_ui8(%elemIn,%elemWts, %elemOut0,%x_dim,%ci,%co,%scale) : (memref<32x1x256xui8>,memref<16384xi8>, memref<32x1x64xui8>,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_05_15(Consume, 1) + aie.objectfifo.release @act_15_12_14(Produce, 1) + + } + aie.objectfifo.release @wts_buf_10(Consume, 1) + } + aie.end + } { link_with="conv2dk1_ui8.o" } + + // 3x3 conv + aie.core(%tile12) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %x_dim = arith.constant 32 : i32 + %y_dim_minus_2 = arith.constant 30 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 32 : i32 + + %kx_dim = arith.constant 3 : i32 + %ky_dim = arith.constant 3 : i32 + + %top = arith.constant 0 : i32 + %middle = arith.constant 1 : i32 + %bottom = arith.constant 2 : i32 + + %co_offset = arith.constant 0 : i32 + // acquire wts once + // %subviewWts = aie.objectfifo.acquire(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) : !aie.objectfifosubview<memref<73728xi8>> + // %scale = memref.load %rtp3[%c0] : memref<16xi32> + + %scale = arith.constant 1 : i32 + %intmax = arith.constant 0xFFFFFFFF : index + scf.for %arg3 = %c0 to %intmax step %c1 { + %subviewWts = aie.objectfifo.acquire @wts_buf_11(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8> + + // Preamble : Top Border + + %subviewIn = aie.objectfifo.acquire @act_15_12_14(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn0 = aie.objectfifo.subview.access %subviewIn[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1 = aie.objectfifo.subview.access %subviewIn[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut = aie.objectfifo.acquire 
@act_12_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + + func.call @conv2dk3_ui8(%elemIn0,%elemIn0,%elemIn1,%elemWts, %elemOut,%x_dim,%ci,%co,%kx_dim,%ky_dim,%top,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_12_13(Produce, 1) + + // Middle + scf.for %n = %c0 to %y_dim_minus_2 step %c1 { + %subviewIn1 = aie.objectfifo.acquire @act_15_12_14(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn1_0 = aie.objectfifo.subview.access %subviewIn1[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_1 = aie.objectfifo.subview.access %subviewIn1[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_2 = aie.objectfifo.subview.access %subviewIn1[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut1 = aie.objectfifo.acquire @act_12_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut1 = aie.objectfifo.subview.access %subviewOut1[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + func.call @conv2dk3_ui8(%elemIn1_0,%elemIn1_1,%elemIn1_2,%elemWts, %elemOut1,%x_dim,%ci,%co,%kx_dim,%ky_dim,%middle,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_12_13(Produce, 1) + aie.objectfifo.release @act_15_12_14(Consume, 1) + + } + // Postamble : Bottom Border + %subviewIn2 = aie.objectfifo.acquire @act_15_12_14(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn2_0 = aie.objectfifo.subview.access %subviewIn2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn2_1 = aie.objectfifo.subview.access %subviewIn2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut2 = aie.objectfifo.acquire @act_12_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut2 = aie.objectfifo.subview.access %subviewOut2[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + func.call @conv2dk3_ui8(%elemIn2_0,%elemIn2_1,%elemIn2_1,%elemWts, %elemOut2,%x_dim,%ci,%co,%kx_dim,%ky_dim,%bottom,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_12_13(Produce, 1) + aie.objectfifo.release @act_15_12_14(Consume, 2) + + //release weights + aie.objectfifo.release @wts_buf_11(Consume, 1) + } + // aie.objectfifo.release(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) + aie.end + } { link_with="conv2dk3.o" } + + // 3x3 conv + aie.core(%tile14) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %x_dim = arith.constant 32 : i32 + %y_dim_minus_2 = arith.constant 30 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 32 : i32 + + %kx_dim = arith.constant 3 : i32 + %ky_dim = arith.constant 3 : i32 + + %top = arith.constant 0 : i32 + %middle = arith.constant 1 : i32 + %bottom = arith.constant 2 : i32 + + %co_offset = arith.constant 32 : i32 + %intmax = arith.constant 0xFFFFFFFF : index + // %scale = memref.load %rtp4[%c0] : memref<16xi32> + %scale = arith.constant 1 : i32 + scf.for %arg3 = %c0 to %intmax step %c1 { + // acquire wts once + // %subviewWts = aie.objectfifo.acquire(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) : !aie.objectfifosubview<memref<73728xi8>> + %subviewWts = aie.objectfifo.acquire @wts_buf_11(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>> + %elemWts = 
aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8> + + // Preamble : Top Border + + %subviewIn = aie.objectfifo.acquire @act_15_12_14(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn0 = aie.objectfifo.subview.access %subviewIn[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1 = aie.objectfifo.subview.access %subviewIn[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut = aie.objectfifo.acquire @act_14_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + func.call @conv2dk3_ui8(%elemIn0,%elemIn0,%elemIn1,%elemWts, %elemOut,%x_dim,%ci,%co,%kx_dim,%ky_dim,%top,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_14_13(Produce, 1) + + // Middle + scf.for %n = %c0 to %y_dim_minus_2 step %c1 { + %subviewIn1 = aie.objectfifo.acquire @act_15_12_14(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn1_0 = aie.objectfifo.subview.access %subviewIn1[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_1 = aie.objectfifo.subview.access %subviewIn1[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_2 = aie.objectfifo.subview.access %subviewIn1[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut1 = aie.objectfifo.acquire @act_14_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut1 = aie.objectfifo.subview.access %subviewOut1[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + func.call @conv2dk3_ui8(%elemIn1_0,%elemIn1_1,%elemIn1_2,%elemWts, %elemOut1,%x_dim,%ci,%co,%kx_dim,%ky_dim,%middle,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_14_13(Produce, 1) + aie.objectfifo.release @act_15_12_14(Consume, 1) + + } + // Postamble : Bottom Border + %subviewIn2 = aie.objectfifo.acquire @act_15_12_14(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn2_0 = aie.objectfifo.subview.access %subviewIn2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn2_1 = aie.objectfifo.subview.access %subviewIn2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut2 = aie.objectfifo.acquire @act_14_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut2 = aie.objectfifo.subview.access %subviewOut2[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + func.call @conv2dk3_ui8(%elemIn2_0,%elemIn2_1,%elemIn2_1,%elemWts, %elemOut2,%x_dim,%ci,%co,%kx_dim,%ky_dim,%bottom,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_14_13(Produce, 1) + aie.objectfifo.release @act_15_12_14(Consume, 2) + + //release weights + aie.objectfifo.release @wts_buf_11(Consume, 1) + // aie.objectfifo.release(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) + } + aie.end + + } { link_with="conv2dk3.o" } + + // 1x1 conv with skip + aie.core(%tile13) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + %x_dim = arith.constant 32 : i32 + %y_dim = arith.constant 32 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 256 : i32 + + %intmax = arith.constant 0xFFFFFFFF : index + scf.for %arg3 = %c0 to %intmax step %c1 { + // acquire wts once + %subviewWts = 
aie.objectfifo.acquire @wts_buf_12(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8> + + %scale = memref.load %rtp13[%c0] : memref<16xi32> + %skip_scale = memref.load %rtp13[%c1] : memref<16xi32> + // %skip_scale = arith.constant 0 : i32 + scf.for %n = %c0 to %y_dim step %c1 { + %subviewIn0 = aie.objectfifo.acquire @act_12_13(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemIn0 = aie.objectfifo.subview.access %subviewIn0[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + %subviewIn1 = aie.objectfifo.acquire @act_14_13(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemIn1 = aie.objectfifo.subview.access %subviewIn1[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + %subviewOut = aie.objectfifo.acquire @act_13_22(Produce, 1) : !aie.objectfifosubview<memref<32x1x256xui8>> + %elemOut0 = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8> + + %subviewSkip = aie.objectfifo.acquire @skip_buf2(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xui8>> + %elemSkip = aie.objectfifo.subview.access %subviewSkip[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8> + + + // %skip_scale = arith.constant 0 : i32 + func.call @conv2dk1_skip_ui8(%elemIn0,%elemIn1,%elemWts, %elemOut0,%elemSkip,%x_dim,%ci,%co,%scale,%skip_scale) : (memref<32x1x32xui8>,memref<32x1x32xui8>, memref<16384xi8>,memref<32x1x256xui8>,memref<32x1x256xui8>,i32,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_13_22(Produce, 1) + aie.objectfifo.release @act_12_13(Consume, 1) + aie.objectfifo.release @act_14_13(Consume, 1) + aie.objectfifo.release @skip_buf2(Consume, 1) + + } + aie.objectfifo.release @wts_buf_12(Consume, 1) + } + aie.end + } { link_with="conv2dk1_skip.o" } + + + // ___________________________Bottleneck 3___________________________ + // 1x1 conv + aie.core(%tile22) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %x_dim = arith.constant 32 : i32 + %y_dim = arith.constant 32 : index + + %ci = arith.constant 256 : i32 + %co = arith.constant 64 : i32 + + %intmax = arith.constant 0xFFFFFFFF : index + scf.for %arg3 = %c0 to %intmax step %c1 { + // acquire wts once + %subviewWts = aie.objectfifo.acquire @wts_buf_20(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8> + %scale = memref.load %rtp22[%c0] : memref<16xi32> + + scf.for %n = %c0 to %y_dim step %c1 { + %subviewIn = aie.objectfifo.acquire @act_13_22(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xui8>> + %elemIn = aie.objectfifo.subview.access %subviewIn[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8> + + %subviewOut = aie.objectfifo.acquire @act_22_23_25(Produce, 1) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemOut0 = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + + func.call @conv2dk1_ui8(%elemIn,%elemWts, %elemOut0,%x_dim,%ci,%co,%scale) : (memref<32x1x256xui8>,memref<16384xi8>, memref<32x1x64xui8>,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_13_22(Consume, 1) + aie.objectfifo.release @act_22_23_25(Produce, 1) + + } + aie.objectfifo.release @wts_buf_20(Consume, 1) + } + aie.end + } { link_with="conv2dk1_ui8.o" } + + // 3x3 conv + aie.core(%tile23) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %x_dim = arith.constant 32 : i32 + %y_dim_minus_2 = arith.constant 30 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 32 : i32 + + %kx_dim = arith.constant 3 : i32 + %ky_dim = arith.constant 3 
: i32 + + %top = arith.constant 0 : i32 + %middle = arith.constant 1 : i32 + %bottom = arith.constant 2 : i32 + + %co_offset = arith.constant 0 : i32 + // acquire wts once + // %subviewWts = aie.objectfifo.acquire(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) : !aie.objectfifosubview<memref<73728xi8>> + // %scale = memref.load %rtp3[%c0] : memref<16xi32> + + %scale = arith.constant 1 : i32 + %intmax = arith.constant 0xFFFFFFFF : index + scf.for %arg3 = %c0 to %intmax step %c1 { + %subviewWts = aie.objectfifo.acquire @wts_buf_21(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8> + + // Preamble : Top Border + + %subviewIn = aie.objectfifo.acquire @act_22_23_25(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn0 = aie.objectfifo.subview.access %subviewIn[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1 = aie.objectfifo.subview.access %subviewIn[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut = aie.objectfifo.acquire @act_23_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + + func.call @conv2dk3_ui8(%elemIn0,%elemIn0,%elemIn1,%elemWts, %elemOut,%x_dim,%ci,%co,%kx_dim,%ky_dim,%top,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_23_24(Produce, 1) + + // Middle + scf.for %n = %c0 to %y_dim_minus_2 step %c1 { + %subviewIn1 = aie.objectfifo.acquire @act_22_23_25(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn1_0 = aie.objectfifo.subview.access %subviewIn1[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_1 = aie.objectfifo.subview.access %subviewIn1[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_2 = aie.objectfifo.subview.access %subviewIn1[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut1 = aie.objectfifo.acquire @act_23_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut1 = aie.objectfifo.subview.access %subviewOut1[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + func.call @conv2dk3_ui8(%elemIn1_0,%elemIn1_1,%elemIn1_2,%elemWts, %elemOut1,%x_dim,%ci,%co,%kx_dim,%ky_dim,%middle,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_23_24(Produce, 1) + aie.objectfifo.release @act_22_23_25(Consume, 1) + + } + // Postamble : Bottom Border + %subviewIn2 = aie.objectfifo.acquire @act_22_23_25(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn2_0 = aie.objectfifo.subview.access %subviewIn2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn2_1 = aie.objectfifo.subview.access %subviewIn2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut2 = aie.objectfifo.acquire @act_23_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut2 = aie.objectfifo.subview.access %subviewOut2[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + func.call @conv2dk3_ui8(%elemIn2_0,%elemIn2_1,%elemIn2_1,%elemWts, %elemOut2,%x_dim,%ci,%co,%kx_dim,%ky_dim,%bottom,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_23_24(Produce, 1) + aie.objectfifo.release @act_22_23_25(Consume, 2) + + //release weights + aie.objectfifo.release 
@wts_buf_21(Consume, 1) + } + // aie.objectfifo.release(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) + aie.end + } { link_with="conv2dk3.o" } + + // 3x3 conv + aie.core(%tile25) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %x_dim = arith.constant 32 : i32 + %y_dim_minus_2 = arith.constant 30 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 32 : i32 + + %kx_dim = arith.constant 3 : i32 + %ky_dim = arith.constant 3 : i32 + + %top = arith.constant 0 : i32 + %middle = arith.constant 1 : i32 + %bottom = arith.constant 2 : i32 + + %co_offset = arith.constant 32 : i32 + %intmax = arith.constant 0xFFFFFFFF : index + // %scale = memref.load %rtp4[%c0] : memref<16xi32> + %scale = arith.constant 1 : i32 + scf.for %arg3 = %c0 to %intmax step %c1 { + // acquire wts once + // %subviewWts = aie.objectfifo.acquire(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) : !aie.objectfifosubview<memref<73728xi8>> + %subviewWts = aie.objectfifo.acquire @wts_buf_21(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8> + + // Preamble : Top Border + + %subviewIn = aie.objectfifo.acquire @act_22_23_25(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn0 = aie.objectfifo.subview.access %subviewIn[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1 = aie.objectfifo.subview.access %subviewIn[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut = aie.objectfifo.acquire @act_25_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + + func.call @conv2dk3_ui8(%elemIn0,%elemIn0,%elemIn1,%elemWts, %elemOut,%x_dim,%ci,%co,%kx_dim,%ky_dim,%top,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_25_24(Produce, 1) + + // Middle + scf.for %n = %c0 to %y_dim_minus_2 step %c1 { + %subviewIn1 = aie.objectfifo.acquire @act_22_23_25(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn1_0 = aie.objectfifo.subview.access %subviewIn1[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_1 = aie.objectfifo.subview.access %subviewIn1[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn1_2 = aie.objectfifo.subview.access %subviewIn1[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut1 = aie.objectfifo.acquire @act_25_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut1 = aie.objectfifo.subview.access %subviewOut1[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + func.call @conv2dk3_ui8(%elemIn1_0,%elemIn1_1,%elemIn1_2,%elemWts, %elemOut1,%x_dim,%ci,%co,%kx_dim,%ky_dim,%middle,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + aie.objectfifo.release @act_25_24(Produce, 1) + aie.objectfifo.release @act_22_23_25(Consume, 1) + + } + // Postamble : Bottom Border + %subviewIn2 = aie.objectfifo.acquire @act_22_23_25(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>> + %elemIn2_0 = aie.objectfifo.subview.access %subviewIn2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + %elemIn2_1 = aie.objectfifo.subview.access %subviewIn2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8> + + %subviewOut2 = aie.objectfifo.acquire @act_25_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemOut2 = aie.objectfifo.subview.access %subviewOut2[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> 
memref<32x1x32xui8> + + + func.call @conv2dk3_ui8(%elemIn2_0,%elemIn2_1,%elemIn2_1,%elemWts, %elemOut2,%x_dim,%ci,%co,%kx_dim,%ky_dim,%bottom,%scale,%co_offset ) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () + + + aie.objectfifo.release @act_25_24(Produce, 1) + aie.objectfifo.release @act_22_23_25(Consume, 2) + + //release weights + aie.objectfifo.release @wts_buf_21(Consume, 1) + // aie.objectfifo.release(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<73728xi8>>, 1) + } + aie.end + + } { link_with="conv2dk3.o" } + + // 1x1 conv with skip + aie.core(%tile24) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + %x_dim = arith.constant 32 : i32 + %y_dim = arith.constant 32 : index + + %ci = arith.constant 64 : i32 + %co = arith.constant 256 : i32 + + %intmax = arith.constant 0xFFFFFFFF : index + scf.for %arg3 = %c0 to %intmax step %c1 { + // acquire wts once + %subviewWts = aie.objectfifo.acquire @wts_buf_22(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>> + %elemWts = aie.objectfifo.subview.access %subviewWts[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8> + + %scale = memref.load %rtp24[%c0] : memref<16xi32> + %skip_scale = memref.load %rtp24[%c1] : memref<16xi32> + // %skip_scale = arith.constant 0 : i32 + scf.for %n = %c0 to %y_dim step %c1 { + %subviewIn0 = aie.objectfifo.acquire @act_23_24(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemIn0 = aie.objectfifo.subview.access %subviewIn0[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + %subviewIn1 = aie.objectfifo.acquire @act_25_24(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>> + %elemIn1 = aie.objectfifo.subview.access %subviewIn1[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8> + + %subviewOut = aie.objectfifo.acquire @outOFL2L3(Produce, 1) : !aie.objectfifosubview<memref<32x1x256xui8>> + %elemOut0 = aie.objectfifo.subview.access %subviewOut[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8> + + %subviewSkip = aie.objectfifo.acquire @skip_buf3(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xui8>> + %elemSkip = aie.objectfifo.subview.access %subviewSkip[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8> + + + // %skip_scale = arith.constant 0 : i32 + func.call @conv2dk1_skip_ui8(%elemIn0,%elemIn1,%elemWts, %elemOut0,%elemSkip,%x_dim,%ci,%co,%scale,%skip_scale) : (memref<32x1x32xui8>,memref<32x1x32xui8>, memref<16384xi8>,memref<32x1x256xui8>,memref<32x1x256xui8>,i32,i32,i32,i32,i32) -> () + + aie.objectfifo.release @outOFL2L3(Produce, 1) + aie.objectfifo.release @act_23_24(Consume, 1) + aie.objectfifo.release @act_25_24(Consume, 1) + aie.objectfifo.release @skip_buf3(Consume, 1) + + } + aie.objectfifo.release @wts_buf_22(Consume, 1) + } + aie.end + } { link_with="conv2dk1_skip.o" }
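+ + // The host sequence below slices one packed L3 weight buffer (lengths in + // 32-bit words): bottleneck 1 takes 18432 words (4096+36864+32768 bytes), + // bottlenecks 2 and 3 take 17408 words each (16384+36864+16384 bytes), which + // puts bottleneck 3 at offset 18432+17408 = 35840 words.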
+ + + func.func @sequence(%in0 : memref<16384xi32>, %wts0 : memref<53248xi32>, %out : memref<65536xi32>) { + // Trace output + + // Trace_Event0, Trace_Event1: Select which events to trace. + // Note that the event buffers only appear to be transferred to DDR in + // bursts of 256 bytes. If less than 256 bytes are written, you may not + // see trace output, or only see it on the next iteration of your + // kernel invocation, as the buffer gets filled up. Note that, even + // though events are encoded as 4 byte words, it may take more than 64 + // events to fill the buffer to 256 bytes and cause a flush, since + // multiple repeating events can be 'compressed' by the trace mechanism. + // In order to always generate sufficient events, we add the "assert + // TRUE" event to one slot, which fires every cycle, and thus fills our + // buffer quickly. + + // Some events: + // TRUE (0x01) + // STREAM_STALL (0x18) + // LOCK_STALL (0x1A) + // EVENTS_CORE_INSTR_EVENT_1 (0x22) + // EVENTS_CORE_INSTR_EVENT_0 (0x21) + // INSTR_VECTOR (0x25) Core executes a vector MAC, ADD or compare instruction + // INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock acquire instruction + // INSTR_LOCK_RELEASE_REQ (0x2D) Core executes a lock release instruction + // EVENTS_CORE_PORT_RUNNING_1 (0x4F) + // EVENTS_CORE_PORT_RUNNING_0 (0x4B) + + + // Trace_Event0 (4 slots) + aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 } + // Trace_Event1 (4 slots) + aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E4 : ui32, value = 0x2D2C1A4F : ui32 } + + // Event slots as configured above: + // 0: Kernel executes vector instruction + // 1: Event 0 -- Kernel starts + // 2: Event 1 -- Kernel done + // 3: Port_Running_0 + // 4: Port_Running_1 + // 5: Lock Stall + // 6: Lock Acquire Instr + // 7: Lock Release Instr + + // Stream_Switch_Event_Port_Selection_0 + // This is necessary to capture the Port_Running_0 and Port_Running_1 events + aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x3FF00 : ui32, value = 0x121 : ui32 } + + // Trace_Control0: Define trace start and stop triggers. Set start event TRUE. + aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340D0 : ui32, value = 0x10000 : ui32 } + + // Start trace copy out. + aiex.ipu.writebd_shimtile { bd_id = 3 : i32, + buffer_length = 16384 : i32, + buffer_offset = 262144 : i32, + enable_packet = 0 : i32, + out_of_order_id = 0 : i32, + packet_id = 0 : i32, + packet_type = 0 : i32, + column = 0 : i32, + column_num = 1 : i32, + d0_stepsize = 0 : i32, + d0_size = 0 : i32, + d0_stride = 0 : i32, + d0_wrap = 0 : i32, + d1_stepsize = 0 : i32, + d1_wrap = 0 : i32, + d1_size = 0 : i32, + d1_stride = 0 : i32, + d2_stepsize = 0 : i32, + d2_size = 0 : i32, + d2_stride = 0 : i32, + ddr_id = 2 : i32, + iteration_current = 0 : i32, + iteration_stepsize = 0 : i32, + iteration_wrap = 0 : i32, + iteration_size = 0 : i32, + iteration_stride = 0 : i32, + lock_acq_enable = 0 : i32, + lock_acq_id = 0 : i32, + lock_acq_val = 0 : i32, + lock_rel_id = 0 : i32, + lock_rel_val = 0 : i32, + next_bd = 0 : i32, + use_next_bd = 0 : i32, + valid_bd = 1 : i32} + aiex.ipu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 } + + //End trace dump + + + + aiex.ipu.rtp_write(0, 2, 0, 1) { buffer_sym_name = "rtp2" } + aiex.ipu.rtp_write(0, 3, 0, 1) { buffer_sym_name = "rtp3" } + aiex.ipu.rtp_write(0, 5, 0, 1) { buffer_sym_name = "rtp4" } + aiex.ipu.rtp_write(0, 4, 0, 1) { buffer_sym_name = "rtp5" } + aiex.ipu.rtp_write(0, 4, 1, 0) { buffer_sym_name = "rtp5" } + aiex.ipu.rtp_write(0, 4, 2, 1) { buffer_sym_name = "rtp5" } + + aiex.ipu.rtp_write(1, 5, 0, 1) { buffer_sym_name = "rtp15" } + aiex.ipu.rtp_write(1, 4, 0, 1) { buffer_sym_name = "rtp14" } + aiex.ipu.rtp_write(1, 2, 0, 1) { buffer_sym_name = "rtp12" } + aiex.ipu.rtp_write(1, 3, 0, 1) { buffer_sym_name = "rtp13" } + aiex.ipu.rtp_write(1, 3, 1, 0) { buffer_sym_name = "rtp13" } + + aiex.ipu.rtp_write(2, 2, 0, 1) { buffer_sym_name = "rtp22" } + aiex.ipu.rtp_write(2, 3, 0, 1) { buffer_sym_name = "rtp23" } + aiex.ipu.rtp_write(2, 5, 0, 1) { buffer_sym_name = "rtp25" } + aiex.ipu.rtp_write(2, 4, 0, 1) { 
buffer_sym_name = "rtp24" } + aiex.ipu.rtp_write(2, 4, 1, 0) { buffer_sym_name = "rtp24" } + + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : i32 + %act_in= arith.constant 16384 : i64 + %act_out= arith.constant 65536 : i64 + %total_wts = arith.constant 18432 : i64 + %total_wts_2 = arith.constant 17408 : i64 + %total_wts_3 = arith.constant 17408 : i64 + %total_wts_3_off = arith.constant 35840 : i64 + + //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words]) + aiex.ipu.dma_memcpy_nd(0, 0, %in0[0, 0, 0, 0][1, 1, 1, %act_in][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32> + aiex.ipu.dma_memcpy_nd(0, 0, %out[0, 0, 0, 0][1, 1, 1, %act_out][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32> + aiex.ipu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, 0][1, 1, 1, %total_wts][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<53248xi32> + aiex.ipu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts][1, 1, 1, %total_wts_2][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_1_L3L2} : memref<53248xi32> + aiex.ipu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32> + + aiex.ipu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + + } +} \ No newline at end of file diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py new file mode 100755 index 0000000000..385a4fc7a5 --- /dev/null +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -0,0 +1,639 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. 
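+ +# Design note (matches the IRON code below): this file builds a single +# bottleneck block on one column. tile(0,2) runs the first 1x1 conv +# (conv2dk1_i8), tile(0,3) and tile(0,5) each run the 3x3 conv on half of the +# output channels (conv2dk3_ui8 with co_offset 0 and 32), and tile(0,4) runs +# the last 1x1 conv and adds the skip path (conv2dk1_skip_i8). The input is +# broadcast to tile(0,2) and the mem tile, which forwards it to tile(0,4) as +# the skip-connection operand.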
+ +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.dialects.ext import memref, arith +from aie.extras.dialects.ext.scf import range_, yield_ +from aie.extras.context import mlir_mod_ctx +from aie.ir import MemRefType, TypeAttr + +import sys + +# tracing definitions +trace_sz_in_bytes = 8192 +trace_sz_in_i32s = trace_sz_in_bytes // 4 +enableTrace = False + +# Define bottleneck layer sizes + +tensorInW = 32 +tensorInH = 32 +tensorInC = 256 + +tensorL1InC = tensorInC +tensorL1OutC = tensorL1InC // 4 + +tensorL2InC = tensorL1OutC +tensorL2OutC = tensorL2InC + +tensorL3InC = tensorL2OutC +tensorL3OutC = tensorL3InC * 4 + + +def bottleneck4AIEs(): + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.ipu) + def deviceBody(): + + # define types + uint8_ty = IntegerType.get_unsigned(8) + int8_ty = IntegerType.get_signless(8) + int16_ty = IntegerType.get_signless(16) + int32_ty = IntegerType.get_signless(32) + + tensorLayer1In_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL1InC, + ), + int8_ty, + ) + weightsLayer1_ty = MemRefType.get((tensorL1InC * tensorL1OutC,), int8_ty) + tensorLayer1Out_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL1OutC, + ), + uint8_ty, + ) + + tensorLayer2In_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL2InC, + ), + uint8_ty, + ) + weightsLayer2_ty = MemRefType.get( + (3 * 3 * tensorL2InC * tensorL2OutC,), int8_ty + ) + tensorLayer2Out_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL2OutC // 2, + ), + uint8_ty, + ) + + tensorLayer3In_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL3InC // 2, + ), + uint8_ty, + ) + weightsLayer3_ty = MemRefType.get((tensorL3InC * tensorL3OutC,), int8_ty) + tensorLayer3Out_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorL3OutC, + ), + uint8_ty, + ) + + allWeights_ty = MemRefType.get( + ( + tensorL1InC * tensorL1OutC + + 3 * 3 * tensorL2InC * tensorL2OutC + + tensorL3InC * tensorL3OutC, + ), + int8_ty, + ) + + # kernel definitions + conv2dk1 = external_func( + "conv2dk1_i8", + inputs=[ + tensorLayer1In_ty, + weightsLayer1_ty, + tensorLayer1Out_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + conv2dk3 = external_func( + "conv2dk3_ui8", + inputs=[ + tensorLayer2In_ty, + tensorLayer2In_ty, + tensorLayer2In_ty, + weightsLayer2_ty, + tensorLayer2Out_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + conv2dk1_skip = external_func( + "conv2dk1_skip_i8", + inputs=[ + tensorLayer3In_ty, + tensorLayer3In_ty, + weightsLayer3_ty, + tensorLayer3Out_ty, + tensorLayer1In_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + if enableTrace: + flow(ComputeTile4, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) + + # runtime parameters + + rtpComputeTile2 = Buffer(ComputeTile2, [16], T.i32(), "rtpComputeTile2") + rtpComputeTile3 = Buffer(ComputeTile3, [16], T.i32(), "rtpComputeTile3") + rtpComputeTile4 = Buffer(ComputeTile4, [16], T.i32(), "rtpComputeTile4") + rtpComputeTile5 = Buffer(ComputeTile5, [16], T.i32(), "rtpComputeTile5") + + # set up data movement with OFs + # input tensor (with broadcast for skip connection) + of_inOF_act_L3L2 = object_fifo( + "inOF_act_L3L2", + ShimTile, + [ComputeTile2, MemTile], + [2, 2, 4], + tensorLayer1In_ty, + ) + of_skip_buf = object_fifo( + "skip_buf", MemTile, ComputeTile4, 2, tensorLayer1In_ty + ) + 
object_fifo_link(of_inOF_act_L3L2, of_skip_buf) + + # weights + inOF_wts_0_L3L2 = object_fifo( + "inOF_wts_0_L3L2", ShimTile, MemTile, 1, allWeights_ty + ) + of_wts_buf_00 = object_fifo( + "wts_buf_00", MemTile, ComputeTile2, 1, weightsLayer1_ty + ) + wts_buf_01 = object_fifo( + "wts_buf_01", + MemTile, + [ComputeTile3, ComputeTile5], + 1, + weightsLayer2_ty, + ) + wts_buf_02 = object_fifo( + "wts_buf_02", MemTile, ComputeTile4, 1, weightsLayer3_ty + ) + object_fifo_link(inOF_wts_0_L3L2, [of_wts_buf_00, wts_buf_01, wts_buf_02]) + + # activation tensor + of_act_2_3_5 = object_fifo( + "act_2_3_5", + ComputeTile2, + [ComputeTile3, ComputeTile5], + [2, 4, 4], + tensorLayer1Out_ty, + ) # 1x1 -> 3x3 + act_3_4 = object_fifo( + "act_3_4", ComputeTile3, ComputeTile4, 2, tensorLayer2Out_ty + ) # 3x3 -> 1x1 + act_5_4 = object_fifo( + "act_5_4", ComputeTile5, ComputeTile4, 2, tensorLayer2Out_ty + ) # 3x3 -> 1x1 + + # output tensor + outOFL2L3 = object_fifo( + "outOFL2L3", ComputeTile4, ShimTile, 2, tensorLayer3Out_ty + ) + + # 1x1 conv2d + @core(ComputeTile2, "conv2dk1.o") + def core_body(): + for _ in range_(sys.maxsize): + + # acquire weights once + element0Weights = of_wts_buf_00.acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtpComputeTile2, [0]) + for _ in range_(tensorInH): + element0ActivactionsIn = of_inOF_act_L3L2.acquire( + ObjectFifoPort.Consume, 1 + ) + element0ActivactionsOut = of_act_2_3_5.acquire( + ObjectFifoPort.Produce, 1 + ) + res = call( + conv2dk1, + [ + element0ActivactionsIn, + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL1InC, + tensorL1OutC, + scale, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "inOF_act_L3L2", 1) + + objectfifo_release(ObjectFifoPort.Produce, "act_2_3_5", 1) + yield_([]) + objectfifo_release(ObjectFifoPort.Consume, "wts_buf_00", 1) + yield_([]) + + # 3x3 conv2d OFM 0-31 + @core(ComputeTile3, "conv2dk3.o") + def core_body(): + scale = 11 + for _ in range_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile3, 0) + + # pre-amble: top row + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act_3_4.acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 0, + scale, + 0, + ], + ) + objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) + + # middle + for _ in range_(tensorInH - 2): + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 3 + ) + element0ActivactionsOut = act_3_4.acquire( + ObjectFifoPort.Produce, 1 + ) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 1, + scale, + 0, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) + objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) + yield_([]) + + # last part + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act_3_4.acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, 
+ 3, + 2, + scale, + 0, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 2) + objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) + + objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) + yield_([]) + + # 3x3 conv2d OFM 32-63 + @core(ComputeTile5, "conv2dk3.o") + def core_body(): + scale = 11 + for _ in range_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile5, 0) + + # pre-amble: top row + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act_5_4.acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 0, + scale, + tensorL2OutC // 2, + ], + ) + + objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) + + # middle + for _ in range_(tensorInH - 2): + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 3 + ) + element0ActivactionsOut = act_5_4.acquire( + ObjectFifoPort.Produce, 1 + ) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 1, + scale, + tensorL2OutC // 2, + ], + ) + + objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) + objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) + yield_([]) + + # last part + elementActivactionsIn = of_act_2_3_5.acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act_5_4.acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorL2InC, + tensorL2OutC, + 3, + 3, + 2, + scale, + tensorL2OutC // 2, + ], + ) + objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 2) + objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) + objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) + yield_([]) + + # # 1x1 conv2d and add skip + @core(ComputeTile4, "conv2dk1_skip.o") + def core_body(): + for _ in range_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_buf_02.acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtpComputeTile4, [0]) + skipScale = memref.load(rtpComputeTile4, [1]) + + for _ in range_(tensorInH): + element0ActivactionsIn = act_3_4.acquire( + ObjectFifoPort.Consume, 1 + ) + element1ActivactionsIn = act_5_4.acquire( + ObjectFifoPort.Consume, 1 + ) + elementSkipsIn = of_skip_buf.acquire(ObjectFifoPort.Consume, 1) + elementActivactionsOut = outOFL2L3.acquire( + ObjectFifoPort.Produce, 1 + ) + + call( + conv2dk1_skip, + [ + element0ActivactionsIn, + element1ActivactionsIn, + element0Weights, + elementActivactionsOut, + elementSkipsIn, + tensorInW, + tensorL3InC, + tensorL3OutC, + scale, + skipScale, + ], + ) + objectfifo_release(ObjectFifoPort.Produce, "outOFL2L3", 1) + objectfifo_release(ObjectFifoPort.Consume, "act_3_4", 1) + objectfifo_release(ObjectFifoPort.Consume, "act_5_4", 1) + objectfifo_release(ObjectFifoPort.Consume, "skip_buf", 1) + yield_([]) + objectfifo_release(ObjectFifoPort.Consume, "wts_buf_02", 1) + yield_([]) + + # instruction stream generation + activationsInSize32b = (tensorInW * tensorInH * tensorInC) // 4 + acitivationsOutSize32b = 
activationsInSize32b + totalWeightsSize32b = ( + tensorL1InC * tensorL1OutC + + 3 * 3 * tensorL2InC * tensorL2OutC + + tensorL3InC * tensorL3OutC + ) // 4 + + activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty) + weightsInL3_ty = MemRefType.get((totalWeightsSize32b,), int32_ty) + + @FuncOp.from_py_func(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty) + def sequence(inputFromL3, weightsFromL3, outputToL3): + + if enableTrace: + # Trace output + + # Trace_Event0, Trace_Event1: Select which events to trace. + # Note that the event buffers only appear to be transferred to DDR in + # bursts of 256 bytes. If less than 256 bytes are written, you may not + # see trace output, or only see it on the next iteration of your + # kernel invocation, as the buffer gets filled up. Note that, even + # though events are encoded as 4 byte words, it may take more than 64 + # events to fill the buffer to 256 bytes and cause a flush, since + # multiple repeating events can be 'compressed' by the trace mechanism. + # In order to always generate sufficient events, we add the "assert + # TRUE" event to one slot, which fires every cycle, and thus fills our + # buffer quickly. + + # Some events: + # TRUE (0x01) + # STREAM_STALL (0x18) + # LOCK_STALL (0x1A) + # EVENTS_CORE_INSTR_EVENT_1 (0x22) + # EVENTS_CORE_INSTR_EVENT_0 (0x21) + # INSTR_VECTOR (0x25) Core executes a vector MAC, ADD or compare instruction + # INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock acquire instruction + # INSTR_LOCK_RELEASE_REQ (0x2D) Core executes a lock release instruction + # EVENTS_CORE_PORT_RUNNING_1 (0x4F) + # EVENTS_CORE_PORT_RUNNING_0 (0x4B) + + # Trace_Event0 (4 slots) + ipu_write32(0, 4, 0x340E0, 0x4B222125) + # Trace_Event1 (4 slots) + ipu_write32(0, 4, 0x340E4, 0x2D2C1A4F) + + # Event slots as configured above: + # 0: Kernel executes vector instruction + # 1: Event 0 -- Kernel starts + # 2: Event 1 -- Kernel done + # 3: Port_Running_0 + # 4: Port_Running_1 + # 5: Lock Stall + # 6: Lock Acquire Instr + # 7: Lock Release Instr + + # Stream_Switch_Event_Port_Selection_0 + # This is necessary to capture the Port_Running_0 and Port_Running_1 events + ipu_write32(0, 4, 0x3FF00, 0x121) + + # Trace_Control0: Define trace start and stop triggers. Set start event TRUE. + ipu_write32(0, 4, 0x340D0, 0x10000) + + # Start trace copy out. 
+ ipu_writebd_shimtile( + bd_id=3, + buffer_length=trace_sz_in_i32s, + buffer_offset=acitivationsOutSize32b, + enable_packet=0, + out_of_order_id=0, + packet_id=0, + packet_type=0, + column=0, + column_num=1, + d0_stepsize=0, + d0_wrap=0, + d1_stepsize=0, + d1_wrap=0, + d2_stepsize=0, + ddr_id=2, + iteration_current=0, + iteration_stepsize=0, + iteration_wrap=0, + lock_acq_enable=0, + lock_acq_id=0, + lock_acq_val=0, + lock_rel_id=0, + lock_rel_val=0, + next_bd=0, + use_next_bd=0, + valid_bd=1, + ) + ipu_write32(0, 2, 0x1D20C, 0x3) + + # write RTP parameters + IpuWriteRTPOp( + "rtpComputeTile2", col=0, row=2, index=0, value=1 + ) # scale + IpuWriteRTPOp( + "rtpComputeTile3", col=0, row=3, index=0, value=1 + ) # scale + IpuWriteRTPOp( + "rtpComputeTile5", col=0, row=5, index=0, value=1 + ) # scale + IpuWriteRTPOp( + "rtpComputeTile4", col=0, row=4, index=0, value=1 + ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input + IpuWriteRTPOp( + "rtpComputeTile4", col=0, row=4, index=1, value=0 + ) # skip_scale + + ipu_dma_memcpy_nd( + metadata="inOF_act_L3L2", + bd_id=0, + mem=inputFromL3, + sizes=[1, 1, 1, activationsInSize32b], + ) + ipu_dma_memcpy_nd( + metadata="outOFL2L3", + bd_id=2, + mem=outputToL3, + sizes=[1, 1, 1, acitivationsOutSize32b], + ) + ipu_dma_memcpy_nd( + metadata="inOF_wts_0_L3L2", + bd_id=1, + mem=weightsFromL3, + sizes=[1, 1, 1, totalWeightsSize32b], + ) + + ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +bottleneck4AIEs() diff --git a/programming_examples/ml/resnet/layers_conv2_x/requirements.txt b/programming_examples/ml/resnet/layers_conv2_x/requirements.txt new file mode 100755 index 0000000000..08ed5eeb4b --- /dev/null +++ b/programming_examples/ml/resnet/layers_conv2_x/requirements.txt @@ -0,0 +1 @@ +torch \ No newline at end of file diff --git a/programming_examples/ml/resnet/layers_conv2_x/run.lit b/programming_examples/ml/resnet/layers_conv2_x/run.lit new file mode 100755 index 0000000000..61f43e45e6 --- /dev/null +++ b/programming_examples/ml/resnet/layers_conv2_x/run.lit @@ -0,0 +1,14 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess, torch +// +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1_i8.o +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk3.cc -o conv2dk3.o +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1_skip_init.cc -o conv2dk1_skip_init.o +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1_ui8.o +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o +// RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// CHECK: PASS! 
\ No newline at end of file diff --git a/programming_examples/ml/resnet/layers_conv2_x/test.py b/programming_examples/ml/resnet/layers_conv2_x/test.py new file mode 100755 index 0000000000..02dc01b127 --- /dev/null +++ b/programming_examples/ml/resnet/layers_conv2_x/test.py @@ -0,0 +1,436 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. + +import torch +import torch.nn as nn +import sys +import math +from aie.utils.ml import DataShaper +import time +import os +import numpy as np +from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute + +torch.use_deterministic_algorithms(True) +torch.manual_seed(0) + +design = "resnet_conv2_x_int8" +xclbin_path = os.path.abspath("build/final.xclbin") +insts_path = os.path.abspath("build/insts.txt") + +log_folder = "log/" +if not os.path.exists(log_folder): + os.makedirs(log_folder) + +num_iter = 1 +npu_time_total = 0 +npu_time_min = 9999999 +npu_time_max = 0 +trace_size = 16384 +enable_trace = False +trace_file = "log/trace_" + design + ".txt" +# ------------------------------------------------------ +# Configure this to match your design's buffer size +# ------------------------------------------------------ +dtype_in = np.dtype("int8") +dtype_wts = np.dtype("int8") +dtype_out = np.dtype("uint8") + +shape_in_act = (32, 8, 32, 8) +shape_total_wts = (212992, 1) +shape_out = (32, 32, 32, 8) + +# ------------------------------------------------------ +# Initialize activation, weights, scaling factor for int8 model +# ------------------------------------------------------ +int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) +block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) +block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) +block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) +block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) + +block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(torch.FloatTensor) +block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) +block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(torch.FloatTensor) + +block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(torch.FloatTensor) +block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) +block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(torch.FloatTensor) + +init_scale = 0.5 +block_0_relu_1 = 0.5 +block_0_relu_2 = 0.5 +block_0_relu_3 = 0.5 + +block_0_weight_scale1 = 0.5 +block_0_weight_scale2 = 0.5 +block_0_weight_scale3 = 0.5 +block_0_weight_scale_skip = 0.5 + +block_1_relu_1 = 0.5 +block_1_relu_2 = 0.5 +block_1_relu_3 = 0.5 + +block_1_weight_scale1 = 0.5 +block_1_weight_scale2 = 0.5 +block_1_weight_scale3 = 0.5 +block_1_quant_add_1 = 0.5 + +block_2_relu_1 = 0.5 +block_2_relu_2 = 0.5 +block_2_relu_3 = 0.5 + +block_2_weight_scale1 = 0.5 +block_2_weight_scale2 = 0.5 +block_2_weight_scale3 = 0.5 +block_2_quant_add_1 = 0.5 + +block_0_combined_scale1 = -math.log2( + init_scale * block_0_weight_scale1 / block_0_relu_1 +) # RHS after first conv1x1 | clip 0-->255 +block_0_combined_scale2 = -math.log2( + block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2 +) # RHS after second conv3x3 | clip 
0-->255 +block_0_combined_scale3 = -math.log2( + block_0_relu_2 * block_0_weight_scale3 / init_scale +) # RHS after third conv1x1 | clip -128-->+127 +block_0_combined_scale_skip = -math.log2( + init_scale * block_0_weight_scale_skip / init_scale +) # LHS after conv1x1 | clip -128-->+127 +block_0_combined_scale4 = -math.log2( + init_scale / block_0_relu_3 +) # After addition | clip 0-->255 + +block_1_combined_scale1 = -math.log2( + block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1 +) # RHS after first conv1x1 | clip 0-->255 +block_1_combined_scale2 = -math.log2( + block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2 +) # RHS after second conv3x3 | clip 0-->255 +block_1_combined_scale3 = -math.log2( + block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1 +) # RHS after third conv1x1 | clip -128-->+127 +block_1_combined_scale4 = -math.log2( + block_1_quant_add_1 / block_1_relu_3 +) # After addition | clip 0-->255 + +block_2_combined_scale1 = -math.log2( + block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1 +) # RHS after first conv1x1 | clip 0-->255 +block_2_combined_scale2 = -math.log2( + block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2 +) # RHS after second conv3x3 | clip 0-->255 +block_2_combined_scale3 = -math.log2( + block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1 +) # RHS after third conv1x1 | clip -128-->+127 +block_2_combined_scale4 = -math.log2( + block_2_quant_add_1 / block_2_relu_3 +) # After addition | clip 0-->255 + +min = 0 +max = 255 + +# ------------------------------------------------------ +# Get device, load the xclbin & kernel and register them +# ------------------------------------------------------ +app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, +) + + +# ------------------------------------------------------ +# Define your golden reference +# ------------------------------------------------------ +class resnet_conv2_x_int8(nn.Module): + expansion = 4 + + def __init__(self, in_planes=64, planes=64): + super(resnet_conv2_x_int8, self).__init__() + + self.shortcut = nn.Conv2d( + in_planes, self.expansion * planes, kernel_size=1, bias=False + ) + # Bottleneck 0 + self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.block_0_conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.block_0_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_0_relu1 = nn.ReLU() + self.block_0_relu2 = nn.ReLU() + self.block_0_relu3 = nn.ReLU() + + # Bottleneck 1 + self.block_1_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_1_conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.block_1_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_1_relu1 = nn.ReLU() + self.block_1_relu2 = nn.ReLU() + self.block_1_relu3 = nn.ReLU() + + # Bottleneck 2 + self.block_2_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_2_conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.block_2_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_2_relu1 = nn.ReLU() + self.block_2_relu2 = nn.ReLU() + self.block_2_relu3 = nn.ReLU() 
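# (Editorial aside: a minimal sketch, assuming the round-half-up shift used by
# the conv2dk1 kernels; `srs` is a hypothetical name.) Each combined scale
# above is the power-of-two shift amount that stands in for the float scale
# product on the NPU:
#
#     def srs(acc, s):
#         # shift-round-saturate core: round half up, then shift right by s
#         return (acc + (1 << (s - 1))) >> s
#
# With every scale set to 0.5, e.g. init_scale * block_0_weight_scale1 /
# block_0_relu_1 == 0.5, the shift is -log2(0.5) == 1, and srs(acc, 1)
# rounds acc / 2 to the nearest integer (ties upward) for non-negative acc.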
+ + def forward(self, x): + # **************** Bottleneck 0 **************** + block_0_conv1_out = self.block_0_conv1(x) * init_scale * block_0_weight_scale1 + block_0_relu1_out = torch.clamp( + torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), + min, + max, + ) # convert to int and apply relu + block_0_conv2_out = ( + self.block_0_conv2(block_0_relu1_out) + * block_0_relu_1 + * block_0_weight_scale2 + ) + block_0_relu2_out = torch.clamp( + torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2), + min, + max, + ) + block_0_conv3_out = ( + self.block_0_conv3(block_0_relu2_out) + * block_0_relu_2 + * block_0_weight_scale3 + ) + block_0_rhs_same_scale = torch.clamp( + torch.round(block_0_conv3_out / init_scale), -128, 127 + ) + + block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip + block_0_lhs_same_scale = torch.clamp( + torch.round(block_0_lhs_conv / init_scale), -128, 127 + ) + # convert skip path to int (no relu on the skip connection) + + block_0_skip_add = init_scale * ( + block_0_rhs_same_scale + block_0_lhs_same_scale + ) + block_0_final_out = torch.clamp( + torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max + ) + # **************** Bottleneck 1 **************** + block_1_conv1_out = ( + self.block_1_conv1(block_0_final_out) + * block_0_relu_3 + * block_1_weight_scale1 + ) + block_1_relu1_out = torch.clamp( + torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1), + min, + max, + ) # convert to int and apply relu + block_1_conv2_out = ( + self.block_1_conv2(block_1_relu1_out) + * block_1_relu_1 + * block_1_weight_scale2 + ) + block_1_relu2_out = torch.clamp( + torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2), + min, + max, + ) + block_1_conv3_out = ( + self.block_1_conv3(block_1_relu2_out) + * block_1_relu_2 + * block_1_weight_scale3 + ) + block_1_rhs_same_scale = torch.clamp( + torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 + ) + + block_1_skip_add = block_0_relu_3 * (block_1_rhs_same_scale + block_0_final_out) + block_1_final_out = torch.clamp( + torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), min, max + ) + + # **************** Bottleneck 2 **************** + block_2_conv1_out = ( + self.block_2_conv1(block_1_final_out) + * block_1_relu_3 + * block_2_weight_scale1 + ) + block_2_relu1_out = torch.clamp( + torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1), + min, + max, + ) # convert to int and apply relu + block_2_conv2_out = ( + self.block_2_conv2(block_2_relu1_out) + * block_2_relu_1 + * block_2_weight_scale2 + ) + block_2_relu2_out = torch.clamp( + torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2), + min, + max, + ) + block_2_conv3_out = ( + self.block_2_conv3(block_2_relu2_out) + * block_2_relu_2 + * block_2_weight_scale3 + ) + block_2_rhs_same_scale = torch.clamp( + torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 + ) + + block_2_skip_add = block_1_relu_3 * (block_2_rhs_same_scale + block_1_final_out) + block_2_final_out = block_2_relu_3 * ( + torch.clamp( + torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), + min, + max, + ) + ) + return block_2_final_out + + +# ------------------------------------------------------ +# PyTorch baseline +# ------------------------------------------------------ +model = resnet_conv2_x_int8() +model.eval() +model.block_0_conv1.weight.data.copy_(block_0_int_weight_1) +model.block_0_conv2.weight.data.copy_(block_0_int_weight_2)
+model.block_0_conv3.weight.data.copy_(block_0_int_weight_3) +model.shortcut.weight.data.copy_(block_0_int_weight_skip) + +model.block_1_conv1.weight.data.copy_(block_1_int_weight_1) +model.block_1_conv2.weight.data.copy_(block_1_int_weight_2) +model.block_1_conv3.weight.data.copy_(block_1_int_weight_3) + +model.block_2_conv1.weight.data.copy_(block_2_int_weight_1) +model.block_2_conv2.weight.data.copy_(block_2_int_weight_2) +model.block_2_conv3.weight.data.copy_(block_2_int_weight_3) + +golden_output = model(int_inp) + +# ------------------------------------------------------ +# Reorder input data-layout +# ------------------------------------------------------ +ds = DataShaper() +before_input = int_inp.squeeze().data.numpy().astype(dtype_in) +before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") +ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") +ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + +block0_wts1 = ds.reorder_mat( + block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) +block0_wts2 = ds.reorder_mat( + block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) +block0_wts3 = ds.reorder_mat( + block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) +block0_wts_skip = ds.reorder_mat( + block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) + +total_wts = np.concatenate( + (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None +) + +block1_wts1 = ds.reorder_mat( + block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) +block1_wts2 = ds.reorder_mat( + block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) +block1_wts3 = ds.reorder_mat( + block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) + +total_wts2 = np.concatenate( + (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None +) + +block2_wts1 = ds.reorder_mat( + block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) +block2_wts2 = ds.reorder_mat( + block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) +block2_wts3 = ds.reorder_mat( + block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" +) + +total_wts3 = np.concatenate( + (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None +) + +total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + +# ------------------------------------------------------ +# Main run loop +# ------------------------------------------------------ +for i in range(num_iter): + start = time.time_ns() + # pass the full three-block weight blob; its size matches shape_total_wts + aie_output = execute(app, ifm_mem_fmt, total_wts3) * block_2_relu_3 + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + +# ------------------------------------------------------ +# Reorder output data-layout +# ------------------------------------------------------ +temp_out = aie_output.reshape(32, 32, 32, 8) +temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") +ofm_mem_fmt = temp_out.reshape(256, 32, 32) +ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") +ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + +# ------------------------------------------------------ +# Compare the AIE output and the golden reference +#
------------------------------------------------------ +print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + +assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=block_2_relu_3, +) + +print("\nPASS!\n")
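# (Editorial aside on the tolerance above; `step` is hypothetical shorthand,
# not a name used in this test.) The kernels emit uint8 codes that the run
# loop dequantizes by multiplying with block_2_relu_3, so the golden model and
# the NPU may legitimately disagree by one code wherever their rounding paths
# differ; rtol=0 with atol=block_2_relu_3 accepts exactly that one-step slack:
#
#     step = block_2_relu_3  # one uint8 dequantization step
#     q_ref = np.clip(np.round(golden_output.detach().numpy() / step), 0, 255)
#     # a kernel result of q_ref +/- 1 dequantizes to within `step` of the
#     # golden output, which np.allclose(..., rtol=0, atol=step) still passes.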