diff --git a/aie_kernels/aie2/conv2dk1.cc b/aie_kernels/aie2/conv2dk1.cc
new file mode 100755
index 0000000000..08eb7312e9
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1.cc
@@ -0,0 +1,413 @@
+//===- conv2dk1.cc -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+// NOTE(review): the #include targets were missing from this patch (likely
+// stripped in transit); reconstructed from the APIs used below
+// (int8_t/uint8_t, printf-style debug, assert, aie::mmul / aie::load_v).
+// TODO confirm against the original sources.
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <aie_api/aie.hpp>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#ifdef SCALAR
+
+const int32_t UMAX = 255;
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - scalar
+// act: int8, wts: int8, out: uint8
+//*****************************************************************************
+void conv2dk1_i8_scalar(int8_t *input, int8_t *kernels, uint8_t *output,
+                        const int32_t input_width, const int32_t input_channels,
+                        const int32_t output_channels, const int scale) {
+  event0();
+
+  // Round-to-nearest offset for the power-of-two rescale below. Guarded so
+  // that scale == 0 does not evaluate (1 << -1), which is undefined behavior.
+  const int round_off = (scale > 0) ? (1 << (scale - 1)) : 0;
+
+  int x, ic, oc, ic8, oc8;
+  // Data layout (from the indexing below): activations/outputs are
+  // [C/8][W][8] tiles; weights are [OC/8][IC/8][IC8][OC8], 64 int8 per tile.
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (x = 0; x < input_width; x++) { // col of output image
+      for (oc8 = 0; oc8 < 8; oc8++) {
+        int sum = 0;
+        int sum_srs = 0;
+
+        // Dot product over all input channels for this output pixel/channel.
+        for (ic = 0; ic < input_channels / 8; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            int val = input[(ic * input_width * 8) + (x * 8) + ic8];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+
+        // Shift-round-saturate: rescale and clip to the uint8 range [0, UMAX].
+        sum_srs = (sum + round_off) >> scale;
+        sum_srs = (sum_srs > UMAX) ? UMAX : (sum_srs < 0) ? 0 : sum_srs;
+        output[(oc * input_width * 8) + (x * 8) + oc8] = sum_srs;
+      }
+    }
+  }
+
+  event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - scalar
+// act: uint8, wts: int8, out: uint8
+//*****************************************************************************
+void conv2dk1_ui8_scalar(uint8_t *input, int8_t *kernels, uint8_t *output,
+                         const int32_t input_width,
+                         const int32_t input_channels,
+                         const int32_t output_channels, const int scale) {
+  event0();
+
+  // Round-to-nearest offset; guarded so scale == 0 does not evaluate
+  // (1 << -1), which is undefined behavior.
+  const int round_off = (scale > 0) ? (1 << (scale - 1)) : 0;
+
+  int x, ic, oc, ic8, oc8;
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (x = 0; x < input_width; x++) { // col of output image
+      for (oc8 = 0; oc8 < 8; oc8++) {
+        int sum = 0;
+        int sum_srs = 0;
+
+        // Dot product over all input channels ([IC/8][W][8] activation tiles
+        // against [OC/8][IC/8][IC8][OC8] weight tiles).
+        for (ic = 0; ic < input_channels / 8; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            uint8_t val = input[(ic * input_width * 8) + (x * 8) + ic8];
+            int8_t k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                               (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+
+        // Shift-round-saturate: rescale and clip to the uint8 range [0, UMAX].
+        sum_srs = (sum + round_off) >> scale;
+        sum_srs = (sum_srs > UMAX) ? UMAX : (sum_srs < 0) ? 0 : sum_srs;
+        output[(oc * input_width * 8) + (x * 8) + oc8] = sum_srs;
+      }
+    }
+  }
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - vector
+// act: int8, wts: int8, out: uint8
+//
+// Assume IC >= 16 as that gives ideal inner loop schedule
+//
+// TODO - Restricting input_width to be a multiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4 but there is some strange scheduling happening now so for
+// now, we do not.
+//*****************************************************************************
+void conv2dk1_i8_vector(int8_t *input, int8_t *kernels, uint8_t *output,
+                        const int32_t input_width, const int32_t input_channels,
+                        const int32_t output_channels, const int scale) {
+  event0();
+
+  // NOTE(review): several aie:: template-argument lists in this body appear
+  // to have been stripped in transit — e.g. "aie::vector in_b" was presumably
+  // "aie::vector<int8, 64> in_b", "aie::zeros()" presumably carried explicit
+  // accumulator arguments, and "to_vector(scaleT)" presumably
+  // "to_vector<uint8>(scaleT)". As written this will not compile; restore the
+  // arguments — TODO confirm against the original sources.
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, int8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+  ::aie::set_rounding(
+      aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  uint8_t *restrict out_ptr = output;
+
+  const int scaleT = scale;
+
+  // Eight 4x8x8 MMUL accumulators -> 8 * 4 = 32 output pixels in flight.
+  MMUL4x8x8 acc_tmp[8];
+  for (int x = 0; x < 8; x++) {
+    acc_tmp[x] = aie::zeros();
+  }
+
+  // TODO Keeping this variable gives a wrong behavior and bad schedule!
+  const int iw = input_width;
+  const int iw_32 = (input_width / 4) / 8;
+
+  // const int iw_32_rem = (input_width / 4) % 8;
+  // const int iw_32_rem = (32 / 4) % 8;
+  assert((input_width / 4) % 8 == 0);
+  const int iw_32_rem = 0; // TODO - See restriction
+
+  assert((input_channels / 8) > 2); // Assume IC >= 16
+
+  // Main path: walk the width in chunks of 32 pixels, accumulating all
+  // input-channel tiles before a single shift-round-saturate store per chunk.
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) {
+        for (int ic = 0; ic < (input_channels / 8); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x = 0; x < 8; x++) {
+              aie::vector in_a = aie::load_v<32>(input);
+              input += 32; // act oc0..3(ic0..7)
+              acc_tmp[x].mac(in_a, in_b);
+            }
+            input += (iw * 8) - 256; // Move to next ic/8 position
+          }
+        // input ptr just moves to next section
+        for (int xx = 0; xx < 8; xx++) {
+          aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+          acc_tmp[xx] = aie::zeros();
+        }
+        input -= ((input_channels / 8) * iw * 8) -
+                 256; // reset to next input_width/32 block
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+      }
+      input -= (iw_32) * 256; // 8*32, reset beginning of input ptr
+      kernels += (input_channels / 8) * 64; // move to next oc/8 weights
+      out_ptr += (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32 > 0) {
+
+  // Remainder path for widths not a multiple of 32. Dead as written:
+  // iw_32_rem is pinned to 0 above (see the assert / TODO restriction).
+  if (iw_32_rem > 0) {
+
+    const int ocs = output_channels;
+    const int ics = input_channels;
+
+    for (int oc = 0; oc < (ocs / 8); oc++) {
+      for (int ic = 0; ic < (ics / 8); ic++)
+        chess_prepare_for_pipelining chess_loop_range(2, ) {
+          aie::vector in_b = aie::load_v<64>(kernels);
+          kernels += 64; // wts ic0..7(oc0..7)
+
+          for (int x = 0; x < iw_32_rem; x++) {
+            aie::vector in_a = aie::load_v<32>(input);
+            input += 32; // act oc0..3(ic0..7)
+            acc_tmp[x].mac(in_a, in_b);
+          }
+          input += (iw * 8) - (iw_32_rem * 32); // Move to next ic/8 position
+        }
+      // input ptr just moves to next section
+      for (int xx = 0; xx < iw_32_rem; xx++) {
+        aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+        aie::store_v(out_ptr, o1);
+        out_ptr += 32;
+        acc_tmp[xx] = aie::zeros();
+      }
+      // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning of
+      // input ptr for remainder
+      input -= 448; // reset to beginning of input ptr for remainder
+      // kernel ptr already at next oc/8
+      out_ptr += (iw * 8) -
+                 (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32_rem > 0)
+
+  event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - vector
+// act: uint8, wts: int8, out: uint8
+//
+// Assume IC >= 16 as that gives ideal inner loop schedule
+//
+// TODO - Restricting input_width to be a multiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4 but there is some strange scheduling happening now so for
+// now, we do not.
+//*****************************************************************************
+void conv2dk1_ui8_vector(uint8_t *input, int8_t *kernels, uint8_t *output,
+                         const int32_t input_width,
+                         const int32_t input_channels,
+                         const int32_t output_channels, const int scale) {
+  event0();
+
+  // NOTE(review): several aie:: template-argument lists in this body appear
+  // to have been stripped in transit — e.g. "aie::vector in_b" was presumably
+  // "aie::vector<int8, 64> in_b", the activation loads presumably
+  // "aie::vector<uint8, 32>", and "to_vector(scaleT)" presumably
+  // "to_vector<uint8>(scaleT)". As written this will not compile; restore
+  // the arguments — TODO confirm against the original sources.
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+  ::aie::set_rounding(
+      aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  uint8_t *restrict out_ptr = output;
+
+  const int scaleT = scale;
+
+  // Eight 4x8x8 MMUL accumulators -> 8 * 4 = 32 output pixels in flight.
+  MMUL4x8x8 acc_tmp[8];
+  for (int x = 0; x < 8; x++) {
+    acc_tmp[x] = aie::zeros();
+  }
+
+  // TODO Keeping this variable gives a wrong behavior and bad schedule!
+  const int iw = input_width;
+  const int iw_32 = (input_width / 4) / 8;
+
+  // const int iw_32_rem = (input_width / 4) % 8;
+  // const int iw_32_rem = (32 / 4) % 8;
+  assert((input_width / 4) % 8 == 0);
+  const int iw_32_rem = 0; // TODO - See restriction
+
+  assert((input_channels / 8) > 2); // Assume IC >= 16
+
+  // Main path: walk the width in chunks of 32 pixels, accumulating all
+  // input-channel tiles before a single shift-round-saturate store per chunk.
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) {
+        for (int ic = 0; ic < (input_channels / 8); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x = 0; x < 8; x++) {
+              aie::vector in_a = aie::load_v<32>(input);
+              input += 32; // act oc0..3(ic0..7)
+              acc_tmp[x].mac(in_a, in_b);
+            }
+            input += (iw * 8) - 256; // Move to next ic/8 position
+          }
+        // input ptr just moves to next section
+        for (int xx = 0; xx < 8; xx++) {
+          aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+          acc_tmp[xx] = aie::zeros();
+        }
+        input -= ((input_channels / 8) * iw * 8) -
+                 256; // reset to next input_width/32 block
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+      }
+      input -= (iw_32) * 256; // 8*32, reset beginning of input ptr
+      kernels += (input_channels / 8) * 64; // move to next oc/8 weights
+      out_ptr += (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32 > 0) {
+
+  // Remainder path for widths not a multiple of 32. Dead as written:
+  // iw_32_rem is pinned to 0 above (see the assert / TODO restriction).
+  if (iw_32_rem > 0) {
+
+    const int ocs = output_channels;
+    const int ics = input_channels;
+
+    for (int oc = 0; oc < (ocs / 8); oc++) {
+      for (int ic = 0; ic < (ics / 8); ic++)
+        chess_prepare_for_pipelining chess_loop_range(2, ) {
+          aie::vector in_b = aie::load_v<64>(kernels);
+          kernels += 64; // wts ic0..7(oc0..7)
+
+          for (int x = 0; x < iw_32_rem; x++) {
+            aie::vector in_a = aie::load_v<32>(input);
+            input += 32; // act oc0..3(ic0..7)
+            acc_tmp[x].mac(in_a, in_b);
+          }
+          input += (iw * 8) - (iw_32_rem * 32); // Move to next ic/8 position
+        }
+      // input ptr just moves to next section
+      for (int xx = 0; xx < iw_32_rem; xx++) {
+        aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+        aie::store_v(out_ptr, o1);
+        out_ptr += 32;
+        acc_tmp[xx] = aie::zeros();
+      }
+      // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning of
+      // input ptr for remainder
+      input -= 448; // reset to beginning of input ptr for remainder
+      // kernel ptr already at next oc/8
+      out_ptr += (iw * 8) -
+                 (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32_rem > 0)
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+//*****************************************************************************
+// conv2d 1x1 wrappers
+//*****************************************************************************
+extern "C" {
+
+// Build-time selection: SCALAR picks the reference implementation, otherwise
+// the vectorized one; INT8_ACT picks the int8-activation entry point,
+// otherwise the uint8-activation one. Exactly one conv2dk1_* symbol is
+// exported per build configuration.
+
+#ifdef SCALAR
+
+#ifdef INT8_ACT
+
+// Exported wrapper: 1x1 conv, int8 activations -> uint8 output, scalar code.
+void conv2dk1_i8(int8_t *input, int8_t *kernels, uint8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale) {
+  conv2dk1_i8_scalar(input, kernels, output, input_width, input_channels,
+                     output_channels, scale);
+}
+
+#else // UINT8_ACT
+
+// Exported wrapper: 1x1 conv, uint8 activations -> uint8 output, scalar code.
+void conv2dk1_ui8(uint8_t *input, int8_t *kernels, uint8_t *output,
+                  const int32_t input_width, const int32_t input_channels,
+                  const int32_t output_channels, const int scale) {
+  conv2dk1_ui8_scalar(input, kernels, output, input_width, input_channels,
+                      output_channels, scale);
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+// Exported wrapper: 1x1 conv, int8 activations -> uint8 output, vector code.
+void conv2dk1_i8(int8_t *input, int8_t *kernels, uint8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale) {
+  conv2dk1_i8_vector(input, kernels, output, input_width, input_channels,
+                     output_channels, scale);
+}
+
+#else // UINT8_ACT
+
+// Exported wrapper: 1x1 conv, uint8 activations -> uint8 output, vector code.
+void conv2dk1_ui8(uint8_t *input, int8_t *kernels, uint8_t *output,
+                  const int32_t input_width, const int32_t input_channels,
+                  const int32_t output_channels, const int scale) {
+  conv2dk1_ui8_vector(input, kernels, output, input_width, input_channels,
+                      output_channels, scale);
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+} // extern "C"
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk1.h b/aie_kernels/aie2/conv2dk1.h
new file mode 100755
index 0000000000..d3c405435e
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1.h
@@ -0,0 +1,25 @@
+//===- conv2dk1.h -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CONV2DK1_H // renamed: _CONV2DK1_H (underscore + uppercase) is reserved
+#define CONV2DK1_H
+
+#include <stdint.h> // int8_t / uint8_t / int32_t used in the declarations
+
+extern "C" {
+// 1x1 convolution: int8 activations, int8 weights -> uint8 output.
+// scale is the power-of-two right-shift applied before saturation.
+void conv2dk1_i8(int8_t *input, int8_t *kernels, uint8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale);
+
+// 1x1 convolution: uint8 activations, int8 weights -> uint8 output.
+void conv2dk1_ui8(uint8_t *input, int8_t *kernels, uint8_t *output,
+                  const int32_t input_width, const int32_t input_channels,
+                  const int32_t output_channels, const int scale);
+} // extern "C"
+
+#endif
diff --git a/aie_kernels/aie2/conv2dk1_i8.cc b/aie_kernels/aie2/conv2dk1_i8.cc
new file mode 100644
index 0000000000..73a9d8ed12
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_i8.cc
@@ -0,0 +1,224 @@
+//===- conv2dk1.cc -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+// NOTE(review): the #include targets were missing from this patch (likely
+// stripped in transit); reconstructed from the APIs used below.
+// TODO confirm against the original sources.
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <aie_api/aie.hpp>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#ifdef SCALAR
+
+const int32_t SMAX = 127;
+const int32_t SMIN = 128;
+
+#ifdef INT8_ACT
+//*****************************************************************************
+// conv2d 1x1 - scalar
+// act: int8, wts: int8, out: int8
+//*****************************************************************************
+void conv2dk1_i8_scalar(int8_t *input, int8_t *kernels, int8_t *output,
+                        const int32_t input_width, const int32_t input_channels,
+                        const int32_t output_channels, const int scale) {
+  event0();
+
+  // Round-to-nearest offset; guarded so scale == 0 does not evaluate
+  // (1 << -1), which is undefined behavior.
+  const int round_off = (scale > 0) ? (1 << (scale - 1)) : 0;
+
+  int x, ic, oc, ic8, oc8;
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (x = 0; x < input_width; x++) { // col of output image
+      for (oc8 = 0; oc8 < 8; oc8++) {
+        int sum = 0;
+        int sum_srs = 0;
+
+        // Dot product over all input channels for this output pixel/channel.
+        for (ic = 0; ic < input_channels / 8; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            int val = input[(ic * input_width * 8) + (x * 8) + ic8];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+
+        // Shift-round-saturate: rescale and clip to int8 range [-SMIN, SMAX]
+        // (SMIN is the magnitude of INT8_MIN, so the clip is [-128, 127]).
+        sum_srs = (sum + round_off) >> scale;
+        sum_srs = (sum_srs > SMAX) ? SMAX : (sum_srs < -SMIN) ? -SMIN : sum_srs;
+        output[(oc * input_width * 8) + (x * 8) + oc8] = sum_srs;
+      }
+    }
+  }
+
+  event1();
+}
+#endif // INT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 - vector
+// act: int8, wts: int8, out: int8
+//
+// Assume IC >= 16 as that gives ideal inner loop schedule
+//
+// TODO - Restricting input_width to be a multiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4 but there is some strange scheduling happening now so for
+// now, we do not.
+//*****************************************************************************
+void conv2dk1_i8_vector(int8_t *input, int8_t *kernels, int8_t *output,
+                        const int32_t input_width, const int32_t input_channels,
+                        const int32_t output_channels, const int scale) {
+  event0();
+
+  // NOTE(review): several aie:: template-argument lists in this body appear
+  // to have been stripped in transit — e.g. "aie::vector in_b" was presumably
+  // "aie::vector<int8, 64> in_b" and "to_vector(scaleT)" presumably
+  // "to_vector<int8>(scaleT)". As written this will not compile; restore the
+  // arguments — TODO confirm against the original sources.
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, int8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to int8
+  ::aie::set_rounding(aie::rounding_mode::symmetric_inf); // Needed to saturate
+                                                          // properly to int8
+
+  int8_t *restrict out_ptr = output;
+
+  const int scaleT = scale;
+
+  // Eight 4x8x8 MMUL accumulators -> 8 * 4 = 32 output pixels in flight.
+  MMUL4x8x8 acc_tmp[8];
+  for (int x = 0; x < 8; x++) {
+    acc_tmp[x] = aie::zeros();
+  }
+
+  // TODO Keeping this variable gives a wrong behavior and bad schedule!
+  const int iw = input_width;
+  const int iw_32 = (input_width / 4) / 8;
+
+  // const int iw_32_rem = (input_width / 4) % 8;
+  // const int iw_32_rem = (32 / 4) % 8;
+  assert((input_width / 4) % 8 == 0);
+  const int iw_32_rem = 0; // TODO - See restriction
+
+  assert((input_channels / 8) > 2); // Assume IC >= 16
+
+  // Main path: walk the width in chunks of 32 pixels, accumulating all
+  // input-channel tiles before a single shift-round-saturate store per chunk.
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) {
+        for (int ic = 0; ic < (input_channels / 8); ic++)
+          chess_prepare_for_pipelining chess_loop_range(2, ) {
+            aie::vector in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x = 0; x < 8; x++) {
+              aie::vector in_a = aie::load_v<32>(input);
+              input += 32; // act oc0..3(ic0..7)
+              acc_tmp[x].mac(in_a, in_b);
+            }
+            input += (iw * 8) - 256; // Move to next ic/8 position
+          }
+        // input ptr just moves to next section
+        for (int xx = 0; xx < 8; xx++) {
+          aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+          acc_tmp[xx] = aie::zeros();
+        }
+        input -= ((input_channels / 8) * iw * 8) -
+                 256; // reset to next input_width/32 block
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+      }
+      input -= (iw_32) * 256; // 8*32, reset beginning of input ptr
+      kernels += (input_channels / 8) * 64; // move to next oc/8 weights
+      out_ptr += (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32 > 0) {
+
+  // Remainder path for widths not a multiple of 32. Dead as written:
+  // iw_32_rem is pinned to 0 above (see the assert / TODO restriction).
+  if (iw_32_rem > 0) {
+
+    const int ocs = output_channels;
+    const int ics = input_channels;
+
+    for (int oc = 0; oc < (ocs / 8); oc++) {
+      for (int ic = 0; ic < (ics / 8); ic++)
+        chess_prepare_for_pipelining chess_loop_range(2, ) {
+          aie::vector in_b = aie::load_v<64>(kernels);
+          kernels += 64; // wts ic0..7(oc0..7)
+
+          for (int x = 0; x < iw_32_rem; x++) {
+            aie::vector in_a = aie::load_v<32>(input);
+            input += 32; // act oc0..3(ic0..7)
+            acc_tmp[x].mac(in_a, in_b);
+          }
+          input += (iw * 8) - (iw_32_rem * 32); // Move to next ic/8 position
+        }
+      // input ptr just moves to next section
+      for (int xx = 0; xx < iw_32_rem; xx++) {
+        aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+        aie::store_v(out_ptr, o1);
+        out_ptr += 32;
+        acc_tmp[xx] = aie::zeros();
+      }
+      // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning of
+      // input ptr for remainder
+      input -= 448; // reset to beginning of input ptr for remainder
+      // kernel ptr already at next oc/8
+      out_ptr += (iw * 8) -
+                 (iw_32_rem *
+                  32); // move to next oc/8 (skip remainder section if present)
+    }
+
+  } // if(iw_32_rem > 0)
+
+  event1();
+}
+#endif // INT8_ACT
+#endif // Vector
+
+//*****************************************************************************
+// conv2d 1x1 wrappers
+//*****************************************************************************
+extern "C" {
+
+// Build-time selection: SCALAR picks the reference implementation, otherwise
+// the vectorized one. Only the INT8_ACT configuration exists in this file.
+
+#ifdef SCALAR
+
+#ifdef INT8_ACT
+
+// Exported wrapper: 1x1 conv, int8 activations -> int8 output, scalar code.
+void conv2dk1_i8(int8_t *input, int8_t *kernels, int8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale) {
+  conv2dk1_i8_scalar(input, kernels, output, input_width, input_channels,
+                     output_channels, scale);
+}
+#endif // INT8_ACT
+#else // Vector
+
+#ifdef INT8_ACT
+
+// Exported wrapper: 1x1 conv, int8 activations -> int8 output, vector code.
+void conv2dk1_i8(int8_t *input, int8_t *kernels, int8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale) {
+  conv2dk1_i8_vector(input, kernels, output, input_width, input_channels,
+                     output_channels, scale);
+}
+#endif // INT8_ACT
+#endif // Vector
+} // extern "C"
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk1_i8.h b/aie_kernels/aie2/conv2dk1_i8.h
new file mode 100644
index 0000000000..98925f8a86
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_i8.h
@@ -0,0 +1,22 @@
+//===- conv2dk1.h -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Guard renamed: the original _CONV2DK1_H is a reserved identifier AND
+// collides with conv2dk1.h's guard, so including both headers would silently
+// drop one set of declarations.
+#ifndef CONV2DK1_I8_H
+#define CONV2DK1_I8_H
+
+#include <stdint.h> // int8_t / int32_t used in the declaration
+
+extern "C" {
+// 1x1 convolution: int8 activations, int8 weights -> int8 output.
+// scale is the power-of-two right-shift applied before saturation.
+void conv2dk1_i8(int8_t *input, int8_t *kernels, int8_t *output,
+                 const int32_t input_width, const int32_t input_channels,
+                 const int32_t output_channels, const int scale);
+
+} // extern "C"
+
+#endif
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk1_skip.cc b/aie_kernels/aie2/conv2dk1_skip.cc
new file mode 100755
index 0000000000..feaa95333b
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_skip.cc
@@ -0,0 +1,766 @@
+//===- conv2dk1_skip.cc -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+// NOTE(review): the #include targets were missing from this patch (likely
+// stripped in transit); reconstructed from the APIs used below.
+// TODO confirm against the original sources.
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+#ifdef SCALAR
+
+const int32_t MIN = 128;
+const int32_t MAX = 127;
+const int32_t UMAX = 255;
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip - scalar
+// act: uint8, wts: int8, skip: int8, out: uint8
+//*****************************************************************************
+void conv2dk1_skip_i8_scalar(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                             uint8_t *output, int8_t *skip,
+                             const int32_t input_width,
+                             const int32_t input_channels,
+                             const int32_t output_channels, const int scale,
+                             const int skip_scale) {
+  event0();
+
+  int x, ic, ic2, oc, oc8, ic8, ic8b;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+  // Round-to-nearest offsets for the two right shifts; guarded so that a
+  // zero scale does not evaluate (1 << -1), which is undefined behavior.
+  const int conv_round = (scaleT > 0) ? (1 << (scaleT - 1)) : 0;
+  const int skip_round = (skip_scaleT > 0) ? (1 << (skip_scaleT - 1)) : 0;
+
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (oc8 = 0; oc8 < 8; oc8++) {
+      for (x = 0; x < input_width; x++) { // col of output image
+        int sum = 0;
+        int sum_srs = 0;
+        int64_t skip_sum = 0;
+        int skip_sum_srs_final = 0;
+        int skip_sum_srs_final_out = 0;
+        int skip_temp = 0;
+
+        // First half of the input channels arrives via input0.
+        for (ic = 0; ic < input_channels / 16; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            int val = input0[(ic * input_width * 8) + (x * 8) + ic8];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+        // Second half arrives via input1; the kernel index is offset by
+        // input_channels/16 channel-tile blocks.
+        for (ic2 = 0; ic2 < input_channels / 16; ic2++) {
+          for (ic8b = 0; ic8b < 8; ic8b++) {
+            int val2 = input1[(ic2 * input_width * 8) + (x * 8) +
+                              ic8b]; // TODO ic2 should be shifted?
+            int k2 = kernels[(oc * (input_channels / 8) * 64) +
+                             ((ic2 + (input_channels / 16)) * 64) + (ic8b * 8) +
+                             oc8];
+            sum += val2 * k2;
+          }
+        }
+        // Shift-round-saturate the convolution; clip to int8 range.
+        sum_srs = (sum + conv_round) >> scaleT;
+        sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < -MIN) ? -MIN : sum_srs;
+
+        // Add the residual (skip) activation, rescale, clip to [0, UMAX].
+        skip_temp = skip[(oc * input_width * 8) + (x * 8) + oc8];
+        skip_sum = sum_srs + skip_temp;
+        skip_sum_srs_final = (int)((skip_sum + skip_round) >> skip_scaleT);
+        skip_sum_srs_final_out = (skip_sum_srs_final > UMAX) ? UMAX
+                                 : (skip_sum_srs_final < 0)
+                                     ? 0
+                                     : skip_sum_srs_final; // clip
+
+        output[(oc * input_width * 8) + (x * 8) + oc8] = skip_sum_srs_final_out;
+      }
+    }
+  }
+
+  event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip - scalar
+// act: uint8, wts: int8, skip: uint8, out: uint8
+//*****************************************************************************
+void conv2dk1_skip_ui8_scalar(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                              uint8_t *output, uint8_t *skip,
+                              const int32_t input_width,
+                              const int32_t input_channels,
+                              const int32_t output_channels, const int scale,
+                              const int skip_scale) {
+  event0();
+
+  int x, ic, ic2, oc, oc8, ic8, ic8b;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+  // Round-to-nearest offsets for the two right shifts; guarded so that a
+  // zero scale does not evaluate (1 << -1), which is undefined behavior.
+  const int conv_round = (scaleT > 0) ? (1 << (scaleT - 1)) : 0;
+  const int skip_round = (skip_scaleT > 0) ? (1 << (skip_scaleT - 1)) : 0;
+
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    for (oc8 = 0; oc8 < 8; oc8++) {
+      for (x = 0; x < input_width; x++) { // col of output image
+        int sum = 0;
+        int sum_srs = 0;
+        int skip_sum = 0;
+        int skip_sum_srs_final = 0;
+        int skip_sum_srs_final_out = 0;
+        uint8_t skip_temp = 0;
+
+        // First half of the input channels arrives via input0.
+        for (ic = 0; ic < input_channels / 16; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            uint8_t val = input0[(ic * input_width * 8) + (x * 8) + ic8];
+            int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+                            (ic8 * 8) + oc8];
+            sum += val * k;
+          }
+        }
+        // Second half arrives via input1; the kernel index is offset by
+        // input_channels/16 channel-tile blocks.
+        for (ic2 = 0; ic2 < input_channels / 16; ic2++) {
+          for (ic8b = 0; ic8b < 8; ic8b++) {
+            uint8_t val2 = input1[(ic2 * input_width * 8) + (x * 8) +
+                                  ic8b]; // TODO ic2 should be shifted?
+            int k2 = kernels[(oc * (input_channels / 8) * 64) +
+                             ((ic2 + (input_channels / 16)) * 64) + (ic8b * 8) +
+                             oc8];
+            sum += val2 * k2;
+          }
+        }
+        // Shift-round-saturate the convolution; clip to int8 range.
+        sum_srs = (sum + conv_round) >> scaleT;
+        sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < -MIN) ? -MIN : sum_srs;
+
+        // Add the residual (skip) activation, rescale, clip to [0, UMAX].
+        skip_temp = skip[(oc * input_width * 8) + (x * 8) + oc8];
+        skip_sum = sum_srs + skip_temp;
+        skip_sum_srs_final = (skip_sum + skip_round) >> skip_scaleT;
+        skip_sum_srs_final_out = (skip_sum_srs_final > UMAX) ? UMAX
+                                 : (skip_sum_srs_final < 0)
+                                     ? 0
+                                     : skip_sum_srs_final; // clip
+
+        output[(oc * input_width * 8) + (x * 8) + oc8] = skip_sum_srs_final_out;
+      }
+    }
+  }
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip - vector
+// act: uint8, wts: int8, skip: int8, out: uint8
+//
+// Assume IC >= 16 as that gives ideal inner loop schedule
+//
+// TODO - Restricting input_width to be a multiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4 but there is some strange scheduling happening now so for
+// now, we do not.
+//*****************************************************************************
+void conv2dk1_skip_i8_vector(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+ uint8_t *output, int8_t *skip,
+ const int32_t input_width,
+ const int32_t input_channels,
+ const int32_t output_channels, const int scale,
+ const int skip_scale) {
+ event0();
+
+ using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+ ::aie::set_saturation(
+ aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+ ::aie::set_rounding(
+ aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+ uint8_t *restrict out_ptr = output;
+ int8_t *i_out_ptr = (int8_t *)output;
+ int8_t *restrict skip_ptr = skip;
+
+ const int scaleT = scale;
+ const int skip_scaleT = skip_scale;
+
+ constexpr int NUM_ACC = 8;
+
+ const int iw_32 = (input_width / 4) / 8;
+ const int iw = input_width;
+ // const int iw_32_rem = (input_width / 4) % 8;
+ assert((input_width / 4) % 8 == 0);
+ const int iw_32_rem = 0; // TODO - See restriction
+
+ assert((input_channels / 8) > 2); // Assume IC >= 16
+
+ int input_offset1 = 0;
+ int input_offset2 = 0;
+
+ if (iw_32 > 0) {
+
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int x = 0; x < iw_32; x++) {
+ MMUL4x8x8 acc_tmp[NUM_ACC];
+ for (int i = 0; i < NUM_ACC; i++) {
+ acc_tmp[i] = aie::zeros();
+ }
+ for (int ic = 0; ic < (input_channels / 16); ic++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) {
+ aie::vector in_b = aie::load_v<64>(kernels);
+ kernels += 64; // wts ic0..7(oc0..7)
+
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ aie::vector in_a =
+ aie::load_v<32>(input0 + input_offset1);
+ input_offset1 += 32; // act oc0..3(ic0..7)
+ acc_tmp[x8].mac(in_a, in_b);
+ }
+ input_offset1 +=
+ (iw * 8) -
+ 256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+ }
+ for (int ic = 0; ic < (input_channels / 16); ic++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) {
+ aie::vector in_b = aie::load_v<64>(kernels);
+ kernels += 64; // wts ic0..7(oc0..7)
+
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ aie::vector in_a =
+ aie::load_v<32>(input1 + input_offset2);
+ input_offset2 += 32; // act oc0..3(ic0..7)
+ acc_tmp[x8].mac(in_a, in_b);
+ }
+ input_offset2 +=
+ (iw * 8) -
+ 256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+ }
+ // input ptr just moves to next section
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ aie::vector skip1 = aie::load_v<32>(skip_ptr);
+ skip_ptr += 32;
+
+ aie::accum accj;
+ accj.from_vector(skip1, 0);
+ accj = aie::add(accj, acc_tmp[x8].to_vector(scaleT));
+ // accj = aie::mac(accj, acc_tmp[x8].to_vector(scaleT),
+ // (uint8_t)1);
+ aie::vector o1 = accj.to_vector(skip_scaleT);
+ aie::store_v(out_ptr, o1);
+ out_ptr += 32;
+ // acc_tmp[x8] = aie::zeros();
+ }
+ input_offset1 -=
+ ((input_channels / 16) * iw * 8) -
+ 256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+ input_offset2 -=
+ ((input_channels / 16) * iw * 8) -
+ 256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+ kernels -=
+ (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+ } // for(int x=0; x skip1 = aie::load_v<32>(skip_ptr);
+ // skip_ptr += 32; aie::vector skip1 =
+ // aie::load_v<32>(skip_ptr); skip_ptr += 32;
+ // // aie::vector tmp = aie::load_v<32>(out_ptr);
+ // aie::vector tmp = aie::load_v<32>(i_out_ptr);
+ // i_out_ptr += 32; aie::accum accj;
+ // accj.from_vector(skip1,0);
+ // accj = aie::mac(accj, tmp, (uint8_t)1);
+ // aie::vector o3 =
+ // accj.to_vector(skip_scaleT); aie::store_v(out_ptr,
+ // o3); out_ptr += 32;
+ // }
+ // }
+ // out_ptr += (iw_32_rem*32);
+ // skip_ptr += (iw_32_rem*32);
+ // }
+
+ out_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+ skip_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+
+ } // if(iw_32 > 0) {
+
+ // **TODO** Move out_ptr and skip_ptr back to first oc/8 rem location
+
+ // if(iw_32_rem > 0) {
+
+ // const int ocs = output_channels;
+ // const int ics = input_channels;
+
+ // input_offset1 = 0; // TODO need to offset this to ic_32_rem position
+ // input_offset2 = 0; // TODO need to offset this to ic_32_rem position
+
+ // for(int oc=0; oc<(ocs/8); oc++) {
+ // for(int ic=0; ic<(ics/16); ic++) {
+ // // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+ // (2x 256b loads)
+ // // For ic > 8, we would load the next 64 weights that are
+ // ic8..15(oc0..7)
+ // // For oc > 8, we would load the next 64 weights after all the ic
+ // weights {OC}{IC}{IC8}{OC8} aie::vector in_b =
+ // aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+ // for(int x=0; x in_a =
+ // aie::load_v<32>(input0+input_offset1); input_offset1 += 32; //
+ // act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+ // }
+ // input_offset1 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+ // position, TODO -(iw_32_rem*8)??
+ // }
+ // for(int ic=0; ic<(ics/16); ic++) {
+ // // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+ // (2x 256b loads)
+ // // For ic > 8, we would load the next 64 weights that are
+ // ic8..15(oc0..7)
+ // // For oc > 8, we would load the next 64 weights after all the ic
+ // weights {OC}{IC}{IC8}{OC8} aie::vector in_b =
+ // aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+ // for(int x=0; x in_a =
+ // aie::load_v<32>(input1+input_offset2); input_offset2 += 32; //
+ // act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+ // }
+ // input_offset2 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+ // position
+ // }
+ // // input ptr just moves to next section
+ // for(int xx=0; xx o1 = acc_tmp[xx].to_vector(scaleT);
+ // aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+ // // aie::store_v(out_ptr, o1); out_ptr += 32;
+ // aie::store_v(i_out_ptr, o1); i_out_ptr += 32;
+ // acc_tmp[xx] = aie::zeros();
+ // }
+ // // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning
+ // of input ptr for remainder input_offset1 -= 448; // reset to
+ // beginning of input ptr for remainder input_offset2 -= 448; // reset
+ // to beginning of input ptr for remainder
+ // // kernel ptr already at next oc/8
+ // i_out_ptr += (iw*8)-(iw_32_rem*32); // move to next oc/8
+ // (skip remainder section if present)
+ // }
+
+ // i_out_ptr -= output_channels*iw;
+
+ // for(int oc=0; oc<(output_channels/8); oc++) {
+ // for(int x8=0; x8 skip1 = aie::load_v<32>(skip_ptr); skip_ptr +=
+ // 32; aie::vector tmp = aie::load_v<32>(i_out_ptr);
+ // aie::accum accj;
+ // accj.from_vector(skip1,0);
+ // accj = aie::mac(accj, tmp, (uint8_t)1);
+ // aie::vector o3 = accj.to_vector(skip_scaleT);
+ // aie::store_v(out_ptr, o3); out_ptr += 32;
+ // }
+ // out_ptr += (iw*8)-(iw_32_rem*32);
+ // skip_ptr += (iw*8)-(iw_32_rem*32);
+ // }
+
+ // } // if(iw_32_rem > 0)
+
+ event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip - vector
+// act: uint8, wts: int8, skip: uint8, out: uint8
+//
+// Assume IC >= 16 as that gives ideal inner loop schedule
+//
+// TODO - Restricting input_width is mutiple of 32
+// Because each VMAC works on 4 inputs at a time and we store intermediate
+// results in 8 accumulators, having input_width be a multiple of 4*8=32 is
+// ideal. However, we should be able to support input_width that is only a
+// multiple of 4 but there is some strange scheduling happening now so for
+// now, we do not.
+//*****************************************************************************
+void conv2dk1_skip_ui8_vector(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+ uint8_t *output, uint8_t *skip,
+ const int32_t input_width,
+ const int32_t input_channels,
+ const int32_t output_channels, const int scale,
+ const int skip_scale) {
+ event0();
+
+ using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+ ::aie::set_saturation(
+ aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+ ::aie::set_rounding(
+ aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+ uint8_t *restrict out_ptr = output;
+ int8_t *i_out_ptr = (int8_t *)output;
+ uint8_t *restrict skip_ptr = skip;
+
+ const int scaleT = scale;
+ const int skip_scaleT = skip_scale;
+
+ constexpr int NUM_ACC = 8;
+
+ const int iw_32 = (input_width / 4) / 8;
+ const int iw = input_width;
+ // const int iw_32_rem = (input_width / 4) % 8;
+ assert((input_width / 4) % 8 == 0);
+ const int iw_32_rem = 0; // TODO - See restriction
+
+ assert((input_channels / 8) > 2); // Assume IC >= 16
+
+ int input_offset1 = 0;
+ int input_offset2 = 0;
+
+ if (iw_32 > 0) {
+
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int x = 0; x < iw_32; x++) {
+ MMUL4x8x8 acc_tmp[NUM_ACC];
+ for (int i = 0; i < NUM_ACC; i++) {
+ acc_tmp[i] = aie::zeros();
+ }
+ for (int ic = 0; ic < (input_channels / 16); ic++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) {
+ aie::vector in_b = aie::load_v<64>(kernels);
+ kernels += 64; // wts ic0..7(oc0..7)
+
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ aie::vector in_a =
+ aie::load_v<32>(input0 + input_offset1);
+ input_offset1 += 32; // act oc0..3(ic0..7)
+ acc_tmp[x8].mac(in_a, in_b);
+ }
+ input_offset1 +=
+ (iw * 8) -
+ 256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+ }
+ for (int ic = 0; ic < (input_channels / 16); ic++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) {
+ aie::vector in_b = aie::load_v<64>(kernels);
+ kernels += 64; // wts ic0..7(oc0..7)
+
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ aie::vector in_a =
+ aie::load_v<32>(input1 + input_offset2);
+ input_offset2 += 32; // act oc0..3(ic0..7)
+ acc_tmp[x8].mac(in_a, in_b);
+ }
+ input_offset2 +=
+ (iw * 8) -
+ 256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+ }
+ // input ptr just moves to next section
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ aie::vector skip1 = aie::load_v<32>(skip_ptr);
+ skip_ptr += 32;
+
+ aie::accum accj;
+ accj.from_vector(skip1, 0);
+ accj = aie::add(accj, acc_tmp[x8].to_vector(scaleT));
+ // accj = aie::mac(accj, acc_tmp[x8].to_vector(scaleT),
+ // (uint8_t)1);
+ aie::vector o1 = accj.to_vector(skip_scaleT);
+ aie::store_v(out_ptr, o1);
+ out_ptr += 32;
+ // acc_tmp[x8] = aie::zeros();
+ }
+ input_offset1 -=
+ ((input_channels / 16) * iw * 8) -
+ 256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+ input_offset2 -=
+ ((input_channels / 16) * iw * 8) -
+ 256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+ kernels -=
+ (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+ } // for(int x=0; x skip1 = aie::load_v<32>(skip_ptr);
+ // skip_ptr += 32; aie::vector skip1 =
+ // aie::load_v<32>(skip_ptr); skip_ptr += 32;
+ // // aie::vector tmp = aie::load_v<32>(out_ptr);
+ // aie::vector tmp = aie::load_v<32>(i_out_ptr);
+ // i_out_ptr += 32; aie::accum accj;
+ // accj.from_vector(skip1,0);
+ // accj = aie::mac(accj, tmp, (uint8_t)1);
+ // aie::vector o3 =
+ // accj.to_vector(skip_scaleT); aie::store_v(out_ptr,
+ // o3); out_ptr += 32;
+ // }
+ // }
+ // out_ptr += (iw_32_rem*32);
+ // skip_ptr += (iw_32_rem*32);
+ // }
+
+ out_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+ skip_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+
+ } // if(iw_32 > 0) {
+
+ // **TODO** Move out_ptr and skip_ptr back to first oc/8 rem location
+
+ // if(iw_32_rem > 0) {
+
+ // const int ocs = output_channels;
+ // const int ics = input_channels;
+
+ // input_offset1 = 0; // TODO need to offset this to ic_32_rem position
+ // input_offset2 = 0; // TODO need to offset this to ic_32_rem position
+
+ // for(int oc=0; oc<(ocs/8); oc++) {
+ // for(int ic=0; ic<(ics/16); ic++) {
+ // // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+ // (2x 256b loads)
+ // // For ic > 8, we would load the next 64 weights that are
+ // ic8..15(oc0..7)
+ // // For oc > 8, we would load the next 64 weights after all the ic
+ // weights {OC}{IC}{IC8}{OC8} aie::vector in_b =
+ // aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+ // for(int x=0; x in_a =
+ // aie::load_v<32>(input0+input_offset1); input_offset1 += 32; //
+ // act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+ // }
+ // input_offset1 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+ // position, TODO -(iw_32_rem*8)??
+ // }
+ // for(int ic=0; ic<(ics/16); ic++) {
+ // // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+ // (2x 256b loads)
+ // // For ic > 8, we would load the next 64 weights that are
+ // ic8..15(oc0..7)
+ // // For oc > 8, we would load the next 64 weights after all the ic
+ // weights {OC}{IC}{IC8}{OC8} aie::vector in_b =
+ // aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+ // for(int x=0; x in_a =
+ // aie::load_v<32>(input1+input_offset2); input_offset2 += 32; //
+ // act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+ // }
+ // input_offset2 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+ // position
+ // }
+ // // input ptr just moves to next section
+ // for(int xx=0; xx o1 = acc_tmp[xx].to_vector(scaleT);
+ // aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+ // // aie::store_v(out_ptr, o1); out_ptr += 32;
+ // aie::store_v(i_out_ptr, o1); i_out_ptr += 32;
+ // acc_tmp[xx] = aie::zeros();
+ // }
+ // // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning
+ // of input ptr for remainder input_offset1 -= 448; // reset to
+ // beginning of input ptr for remainder input_offset2 -= 448; // reset
+ // to beginning of input ptr for remainder
+ // // kernel ptr already at next oc/8
+ // i_out_ptr += (iw*8)-(iw_32_rem*32); // move to next oc/8
+ // (skip remainder section if present)
+ // }
+
+ // i_out_ptr -= output_channels*iw;
+
+ // for(int oc=0; oc<(output_channels/8); oc++) {
+ // for(int x8=0; x8 skip1 = aie::load_v<32>(skip_ptr); skip_ptr +=
+ // 32; aie::vector tmp = aie::load_v<32>(i_out_ptr);
+ // aie::accum accj;
+ // accj.from_vector(skip1,0);
+ // accj = aie::mac(accj, tmp, (uint8_t)1);
+ // aie::vector o3 = accj.to_vector(skip_scaleT);
+ // aie::store_v(out_ptr, o3); out_ptr += 32;
+ // }
+ // out_ptr += (iw*8)-(iw_32_rem*32);
+ // skip_ptr += (iw*8)-(iw_32_rem*32);
+ // }
+
+ // } // if(iw_32_rem > 0)
+
+ event1();
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
//*****************************************************************************
// conv2d 1x1 skip wrappers
//
// C-linkage entry points.  Exactly one implementation pair is compiled in,
// selected at build time: SCALAR picks the scalar reference kernels over the
// vector kernels, and INT8_ACT picks int8 skip activations over uint8.
//*****************************************************************************
extern "C" {

#ifdef SCALAR

#ifdef INT8_ACT

// Scalar kernel, int8 skip activations.
void conv2dk1_skip_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
                      uint8_t *output, int8_t *skip, const int32_t input_width,
                      const int32_t input_channels,
                      const int32_t output_channels, const int scale,
                      const int skip_scale) {
  conv2dk1_skip_i8_scalar(input0, input1, kernels, output, skip, input_width,
                          input_channels, output_channels, scale, skip_scale);
}

#else // UINT8_ACT

// Scalar kernel, uint8 skip activations.
void conv2dk1_skip_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
                       uint8_t *output, uint8_t *skip,
                       const int32_t input_width, const int32_t input_channels,
                       const int32_t output_channels, const int scale,
                       const int skip_scale) {
  conv2dk1_skip_ui8_scalar(input0, input1, kernels, output, skip, input_width,
                           input_channels, output_channels, scale, skip_scale);
}

#endif // UINT8_ACT

#else // Vector

#ifdef INT8_ACT

// Vector kernel, int8 skip activations.
void conv2dk1_skip_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
                      uint8_t *output, int8_t *skip, const int32_t input_width,
                      const int32_t input_channels,
                      const int32_t output_channels, const int scale,
                      const int skip_scale) {
  conv2dk1_skip_i8_vector(input0, input1, kernels, output, skip, input_width,
                          input_channels, output_channels, scale, skip_scale);
}

#else // UINT8_ACT

// Vector kernel, uint8 skip activations.
void conv2dk1_skip_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
                       uint8_t *output, uint8_t *skip,
                       const int32_t input_width, const int32_t input_channels,
                       const int32_t output_channels, const int scale,
                       const int skip_scale) {
  conv2dk1_skip_ui8_vector(input0, input1, kernels, output, skip, input_width,
                           input_channels, output_channels, scale, skip_scale);
}

#endif // UINT8_ACT

#endif // Vector

} // extern "C"
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk1_skip.h b/aie_kernels/aie2/conv2dk1_skip.h
new file mode 100755
index 0000000000..8daa62e507
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_skip.h
@@ -0,0 +1,31 @@
+//===- conv2dk1_skip.h -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CONV2DK1_SKIP_H
+#define _CONV2DK1_SKIP_H
+
extern "C" {

// 1x1 convolution with residual (skip) add over channel-tiled activations.
// The concrete implementation (scalar vs. vector, chosen by the SCALAR and
// INT8_ACT defines) is selected when conv2dk1_skip.cc is compiled; callers
// see only these two C-linkage symbols.

// Variant taking int8 skip activations.
void conv2dk1_skip_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
                      uint8_t *output, int8_t *skip, const int32_t input_width,
                      const int32_t input_channels,
                      const int32_t output_channels, const int scale,
                      const int skip_scale);

// Variant taking uint8 skip activations.
void conv2dk1_skip_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
                       uint8_t *output, uint8_t *skip,
                       const int32_t input_width, const int32_t input_channels,
                       const int32_t output_channels, const int scale,
                       const int skip_scale);

} // extern "C"
+
+#endif
diff --git a/aie_kernels/aie2/conv2dk1_skip_init.cc b/aie_kernels/aie2/conv2dk1_skip_init.cc
new file mode 100755
index 0000000000..591377479f
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_skip_init.cc
@@ -0,0 +1,934 @@
+//===- conv2dk1_skip_init.cc -------------------------------------------------*-
+// C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include
+#include
+#include
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include
+
+#ifdef SCALAR
+
+const int32_t MIN = 128;
+const int32_t MAX = 127;
+const int32_t UMAX = 255;
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip init - scalar
+// act: uint8, wts: int8, skip: int8, out: uint8
+//*****************************************************************************
+// NOTE: Assumes input_channels >= 16
+void conv2dk1_skip_init_i8_scalar(
+ uint8_t *input0, uint8_t *input1, int8_t *kernels, uint8_t *output,
+ int8_t *skip, const int32_t input_width, const int32_t input_channels,
+ const int32_t output_channels, const int32_t input_channels_skip,
+ const int scale, const int skip_scale, const int scale_skip_conv) {
+ event0();
+
+ int x, ic, ic2, ic3, oc, oc8, ic8, ic8b, ic8c;
+
+ const int scaleT = scale;
+ const int skip_scaleT = skip_scale;
+ const int skip_scaleT_conv = scale_skip_conv;
+ const int wts_offset = output_channels * input_channels;
+
+ // const int scaleT = 10;
+ // const int skip_scaleT = 0;
+
+ for (oc = 0; oc < output_channels / 8; oc++) {
+ for (oc8 = 0; oc8 < 8; oc8++) {
+ for (x = 0; x < input_width; x++) { // col of output image
+ int sum = 0;
+ int sum_srs = 0;
+ int sum_skip_conv = 0;
+ int sum_skip_conv_srs = 0;
+ int64_t skip_sum = 0;
+ int skip_sum_srs_final = 0;
+ int skip_sum_srs_final_out = 0;
+ int skip_temp = 0;
+ for (ic = 0; ic < input_channels / 16; ic++) {
+ for (ic8 = 0; ic8 < 8; ic8++) {
+ // int val = input0[ic * input_width + x];
+ int val = input0[(ic * input_width * 8) + (x * 8) + ic8];
+ // int k = kernels[oc * input_channels + ic];
+ int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+ (ic8 * 8) + oc8];
+ sum += val * k;
+ }
+ }
+ // for (ic2 = input_channels/16; ic2 < input_channels/8; ic2++) {
+ for (ic2 = 0; ic2 < input_channels / 16; ic2++) {
+ for (ic8b = 0; ic8b < 8; ic8b++) {
+ // int val2 = input1[ic2 * input_width + x];
+ int val2 = input1[(ic2 * input_width * 8) + (x * 8) +
+ ic8b]; // TODO ic2 should be shifted?
+ // int k2 = kernels[oc * input_channels + ic2];
+ int k2 = kernels[(oc * (input_channels / 8) * 64) +
+ ((ic2 + (input_channels / 16)) * 64) + (ic8b * 8) +
+ oc8];
+ sum += val2 * k2;
+ }
+ }
+ // scale for convolution
+ sum_srs = (sum + (1 << (scaleT - 1))) >> scaleT;
+ sum_srs = (sum_srs > MAX) ? MAX
+ : (sum_srs < -MIN) ? -MIN
+ : sum_srs; // clip
+ // sum_srs = (sum_srs > UMAX) ? UMAX : (sum_srs < 0) ? 0 : sum_srs;
+ // //clip
+ // ********************************************************************************************************************
+ // skip convolution
+ for (ic3 = 0; ic3 < input_channels_skip / 8; ic3++) {
+ for (ic8c = 0; ic8c < 8; ic8c++) {
+ int val3 = skip[(ic3 * input_width * 8) + (x * 8) + ic8c];
+ int k3 = kernels[(oc * (input_channels_skip / 8) * 64) +
+ (ic3 * 64) + (ic8c * 8) + oc8 + wts_offset];
+ sum_skip_conv += val3 * k3;
+ }
+ }
+ sum_skip_conv_srs =
+ (sum_skip_conv + (1 << (skip_scaleT_conv - 1))) >> skip_scaleT_conv;
+ sum_skip_conv_srs = (sum_skip_conv_srs > MAX) ? MAX
+ : (sum_skip_conv_srs < -MIN) ? -MIN
+ : sum_skip_conv_srs;
+ // ********************************************************************************************************************
+ // scale for residual
+ // skip_temp=skip[oc * input_width + x];
+ // skip_temp=skip[(oc*input_width*8) + (x*8) + oc8] ;
+ skip_temp = sum_skip_conv_srs;
+ skip_sum = sum_srs + skip_temp;
+ skip_sum_srs_final =
+ (skip_sum + (1 << (skip_scaleT - 1))) >> skip_scaleT;
+ skip_sum_srs_final_out = (skip_sum_srs_final > UMAX) ? UMAX
+ : (skip_sum_srs_final < 0)
+ ? 0
+ : skip_sum_srs_final; // clip
+
+ // output[oc * input_width + x] = skip_sum_srs_final_out;
+ output[(oc * input_width * 8) + (x * 8) + oc8] = skip_sum_srs_final_out;
+
+ // output[oc * input_width + x] = sum;
+ // output[oc * input_width + x] = sum+skip[oc * input_width + x];
+ }
+ }
+ }
+
+ // for (oc = 0; oc < output_channels; ++oc) {
+ // for (x = 0; x < input_width; ++x) {
+ // output[oc * input_width + x]=skip[oc * input_width + x];}
+ // }
+
+ event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip init - scalar
+// act: uint8, wts: int8, skip: uint8, out: uint8
+//
+// NOTE: TODO Currently just a copy of the i8 code. No real differences
+//*****************************************************************************
+void conv2dk1_skip_init_ui8_scalar(
+ uint8_t *input0, uint8_t *input1, int8_t *kernels, uint8_t *output,
+ uint8_t *skip, const int32_t input_width, const int32_t input_channels,
+ const int32_t output_channels, const int32_t input_channels_skip,
+ const int scale, const int skip_scale, const int scale_skip_conv) {
+ event0();
+
+ int x, ic, ic2, ic3, oc, oc8, ic8, ic8b, ic8c;
+
+ const int scaleT = scale;
+ const int skip_scaleT = skip_scale;
+ const int skip_scaleT_conv = scale_skip_conv;
+ const int wts_offset = output_channels * input_channels;
+
+ // const int scaleT = 10;
+ // const int skip_scaleT = 0;
+
+ for (oc = 0; oc < output_channels / 8; oc++) {
+ for (oc8 = 0; oc8 < 8; oc8++) {
+ for (x = 0; x < input_width; x++) { // col of output image
+ int sum = 0;
+ int sum_srs = 0;
+ int sum_skip_conv = 0;
+ int sum_skip_conv_srs = 0;
+ int64_t skip_sum = 0;
+ int skip_sum_srs_final = 0;
+ int skip_sum_srs_final_out = 0;
+ int skip_temp = 0;
+ for (ic = 0; ic < input_channels / 16; ic++) {
+ for (ic8 = 0; ic8 < 8; ic8++) {
+ // int val = input0[ic * input_width + x];
+ int val = input0[(ic * input_width * 8) + (x * 8) + ic8];
+ // int k = kernels[oc * input_channels + ic];
+ int k = kernels[(oc * (input_channels / 8) * 64) + (ic * 64) +
+ (ic8 * 8) + oc8];
+ sum += val * k;
+ }
+ }
+ // for (ic2 = input_channels/16; ic2 < input_channels/8; ic2++) {
+ for (ic2 = 0; ic2 < input_channels / 16; ic2++) {
+ for (ic8b = 0; ic8b < 8; ic8b++) {
+ // int val2 = input1[ic2 * input_width + x];
+ int val2 = input1[(ic2 * input_width * 8) + (x * 8) +
+ ic8b]; // TODO ic2 should be shifted?
+ // int k2 = kernels[oc * input_channels + ic2];
+ int k2 = kernels[(oc * (input_channels / 8) * 64) +
+ ((ic2 + (input_channels / 16)) * 64) + (ic8b * 8) +
+ oc8];
+ sum += val2 * k2;
+ }
+ }
+ // scale for convolution
+ sum_srs = (sum + (1 << (scaleT - 1))) >> scaleT;
+ sum_srs = (sum_srs > MAX) ? MAX
+ : (sum_srs < -MIN) ? -MIN
+ : sum_srs; // clip
+ // sum_srs = (sum_srs > UMAX) ? UMAX : (sum_srs < 0) ? 0 : sum_srs;
+ // //clip
+ // ********************************************************************************************************************
+ // skip convolution
+ for (ic3 = 0; ic3 < input_channels_skip / 8; ic3++) {
+ for (ic8c = 0; ic8c < 8; ic8c++) {
+ int val3 = skip[(ic3 * input_width * 8) + (x * 8) + ic8c];
+ int k3 = kernels[(oc * (input_channels_skip / 8) * 64) +
+ (ic3 * 64) + (ic8c * 8) + oc8 + wts_offset];
+ sum_skip_conv += val3 * k3;
+ }
+ }
+ sum_skip_conv_srs =
+ (sum_skip_conv + (1 << (skip_scaleT_conv - 1))) >> skip_scaleT_conv;
+ sum_skip_conv_srs = (sum_skip_conv_srs > MAX) ? MAX
+ : (sum_skip_conv_srs < -MIN) ? -MIN
+ : sum_skip_conv_srs;
+ // ********************************************************************************************************************
+ // scale for residual
+ // skip_temp=skip[oc * input_width + x];
+ // skip_temp=skip[(oc*input_width*8) + (x*8) + oc8] ;
+ skip_temp = sum_skip_conv_srs;
+ skip_sum = sum_srs + skip_temp;
+ skip_sum_srs_final =
+ (skip_sum + (1 << (skip_scaleT - 1))) >> skip_scaleT;
+ skip_sum_srs_final_out = (skip_sum_srs_final > UMAX) ? UMAX
+ : (skip_sum_srs_final < 0)
+ ? 0
+ : skip_sum_srs_final; // clip
+
+ // output[oc * input_width + x] = skip_sum_srs_final_out;
+ output[(oc * input_width * 8) + (x * 8) + oc8] = skip_sum_srs_final_out;
+
+ // output[oc * input_width + x] = sum;
+ // output[oc * input_width + x] = sum+skip[oc * input_width + x];
+ }
+ }
+ }
+
+ // for (oc = 0; oc < output_channels; ++oc) {
+ // for (x = 0; x < input_width; ++x) {
+ // output[oc * input_width + x]=skip[oc * input_width + x];}
+ // }
+
+ event1();
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip init - vector
+// act: uint8, wts: int8, skip: int8, out: uint8
+//*****************************************************************************
+void conv2dk1_skip_init_i8_vector(
+ uint8_t *input0, uint8_t *input1, int8_t *kernels, uint8_t *output,
+ int8_t *skip, const int32_t input_width, const int32_t input_channels,
+ const int32_t output_channels, const int32_t input_channels_skip,
+ const int scale, const int skip_scale, const int scale_skip_conv)
+
+{
+ event0();
+
+ using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+ using MMULi4x8x8 = aie::mmul<4, 8, 8, int8, int8>;
+ ::aie::set_saturation(
+ aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+ ::aie::set_rounding(
+ aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+ uint8_t * /*restrict*/ out_ptr = output;
+ int8_t *i_out_ptr = (int8_t *)output;
+ // uint8_t * restrict skip_ptr = skip;
+ int8_t *restrict skip_ptr = skip;
+
+ const int wts_offset = output_channels * input_channels;
+ int8_t *kernels_skip = kernels + wts_offset;
+
+ const int scaleT = scale;
+ const int skip_scaleT = skip_scale;
+ const int scaleT_skip_conv = scale_skip_conv;
+
+ constexpr int NUM_ACC = 8;
+
+ const int iw_32 = (input_width / 4) / 8;
+ const int iw = input_width;
+ const int iw_32_rem = (input_width / 4) % 8;
+
+ int input_offset1 = 0;
+ int input_offset2 = 0;
+ int input_offset3 = 0;
+
+ // aie::vector vec_tmp[NUM_ACC];
+
+ if (iw_32 > 0) {
+
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int x = 0; x < iw_32; x++) {
+ aie::vector vec_conv[NUM_ACC];
+ aie::vector vec_skip[NUM_ACC];
+
+ { // conv section
+ MMUL4x8x8 acc_tmp[NUM_ACC];
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ acc_tmp[x8] = aie::zeros();
+ }
+
+ for (int ic = 0; ic < (input_channels / 16); ic++) { // half ic/8
+ // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+ // (2x 256b loads) For ic > 8, we would load the next 64 weights
+ // that are ic8..15(oc0..7) For oc > 8, we would load the next 64
+ // weights after all the ic weights {OC}{IC}{IC8}{OC8}
+ aie::vector in_b = aie::load_v<64>(kernels);
+ kernels += 64; // wts ic0..7(oc0..7)
+
+ for (int x8 = 0; x8 < NUM_ACC; x8++)
+ // chess_prepare_for_pipelining //chess_loop_range(7, )
+ // e.g. 28/4 = 7
+ // 13 cycles delay for vload.
+ // 7 gives us 3 cycle inner loop.
+ // 13 gave 1 cycle inner loop before partial load, not it only gets
+ // 2 cycles (not sure why?)
+ {
+ aie::vector in_a =
+ aie::load_v<32>(input0 + input_offset1);
+ input_offset1 += 32; // act oc0..3(ic0..7)
+ acc_tmp[x8].mac(in_a, in_b);
+ }
+ input_offset1 +=
+ (iw * 8) -
+ 256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+ }
+ for (int ic = 0; ic < (input_channels / 16); ic++) { // half ic/8
+ // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+ // (2x 256b loads) For ic > 8, we would load the next 64 weights
+ // that are ic8..15(oc0..7) For oc > 8, we would load the next 64
+ // weights after all the ic weights {OC}{IC}{IC8}{OC8}
+ aie::vector in_b = aie::load_v<64>(kernels);
+ kernels += 64; // wts ic0..7(oc0..7)
+
+ for (int x8 = 0; x8 < NUM_ACC; x8++)
+ // chess_prepare_for_pipelining //chess_loop_range(7, )
+ // e.g. 28/4 = 7
+ // 13 cycles delay for vload.
+ // 7 gives us 3 cycle inner loop.
+ // 13 gave 1 cycle inner loop before partial load, not it only gets
+ // 2 cycles (not sure why?)
+ {
+ aie::vector in_a =
+ aie::load_v<32>(input1 + input_offset2);
+ input_offset2 += 32; // act oc0..3(ic0..7)
+ acc_tmp[x8].mac(in_a, in_b);
+ }
+ input_offset2 +=
+ (iw * 8) -
+ 256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+ }
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ vec_conv[x8] = acc_tmp[x8].to_vector(scaleT);
+ }
+ } // conv section
+
+ { // skip section
+ MMULi4x8x8 acci_tmp[NUM_ACC];
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ acci_tmp[x8] = aie::zeros();
+ }
+
+ for (int ic = 0; ic < (input_channels_skip / 8); ic++) {
+ // For oc > 8, we would load the next 64 weights after all the ic
+ // weights {OC}{IC}{IC8}{OC8}
+ aie::vector in_b = aie::load_v<64>(kernels_skip);
+ kernels_skip += 64; // wts ic0..7(oc0..7)
+
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ aie::vector in_a =
+ aie::load_v<32>(skip + input_offset3);
+ input_offset3 += 32; // act oc0..3(ic0..7)
+ acci_tmp[x8].mac(in_a, in_b);
+ }
+ input_offset3 +=
+ (iw * 8) -
+ 256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+ }
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ vec_skip[x8] = acci_tmp[x8].to_vector(scaleT_skip_conv);
+ }
+ } // skip section
+
+ // input ptr just moves to next section
+ for (int x8 = 0; x8 < NUM_ACC; x8++) {
+ aie::accum accj;
+ accj.from_vector(vec_conv[x8], 0);
+ accj = aie::add(accj, vec_skip[x8]);
+ aie::vector o1 = accj.to_vector(skip_scaleT);
+ aie::store_v(out_ptr, o1);
+ out_ptr += 32;
+ }
+ input_offset1 -=
+ ((input_channels / 16) * iw * 8) -
+ 256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+ input_offset2 -=
+ ((input_channels / 16) * iw * 8) -
+ 256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+ input_offset3 -=
+ ((input_channels_skip / 8) * iw * 8) -
+ 256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+ kernels -=
+ (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+ kernels_skip -= (input_channels_skip / 8) *
+ 64; // reset kernel back to beginning of ic/8
+ } // for(int x=0; x skip1 = aie::load_v<32>(skip_ptr);
+ // skip_ptr += 32; aie::vector skip1 =
+ // aie::load_v<32>(skip_ptr); skip_ptr += 32;
+ // // aie::vector tmp = aie::load_v<32>(out_ptr);
+ // aie::vector tmp = aie::load_v<32>(i_out_ptr);
+ // i_out_ptr += 32; aie::accum accj;
+ // accj.from_vector(skip1,0);
+ // accj = aie::mac(accj, tmp, (uint8_t)1);
+ // aie::vector o3 =
+ // accj.to_vector(skip_scaleT); aie::store_v(out_ptr,
+ // o3); out_ptr += 32;
+ // }
+ // }
+ // out_ptr += (iw_32_rem*32);
+ // skip_ptr += (iw_32_rem*32);
+ // }
+
+ out_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+ skip_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+
+ } // if(iw_32 > 0) {
+
+ // **TODO** Move out_ptr and skip_ptr back to first oc/8 rem location
+
+ // if(iw_32_rem > 0) {
+
+ // const int ocs = output_channels;
+ // const int ics = input_channels;
+
+ // input_offset1 = 0; // TODO need to offset this to ic_32_rem position
+ // input_offset2 = 0; // TODO need to offset this to ic_32_rem position
+
+ // for(int oc=0; oc<(ocs/8); oc++) {
+ // for(int ic=0; ic<(ics/16); ic++) {
+ // // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+ // (2x 256b loads)
+ // // For ic > 8, we would load the next 64 weights that are
+ // ic8..15(oc0..7)
+ // // For oc > 8, we would load the next 64 weights after all the ic
+ // weights {OC}{IC}{IC8}{OC8} aie::vector in_b =
+ // aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+ // for(int x=0; x in_a =
+ // aie::load_v<32>(input0+input_offset1); input_offset1 += 32; //
+ // act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+ // }
+ // input_offset1 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+ // position, TODO -(iw_32_rem*8)??
+ // }
+ // for(int ic=0; ic<(ics/16); ic++) {
+ // // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+ // (2x 256b loads)
+ // // For ic > 8, we would load the next 64 weights that are
+ // ic8..15(oc0..7)
+ // // For oc > 8, we would load the next 64 weights after all the ic
+ // weights {OC}{IC}{IC8}{OC8} aie::vector in_b =
+ // aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+ // for(int x=0; x in_a =
+ // aie::load_v<32>(input1+input_offset2); input_offset2 += 32; //
+ // act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+ // }
+ // input_offset2 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+ // position
+ // }
+ // // input ptr just moves to next section
+ // for(int xx=0; xx o1 = acc_tmp[xx].to_vector(scaleT);
+ // aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+ // // aie::store_v(out_ptr, o1); out_ptr += 32;
+ // aie::store_v(i_out_ptr, o1); i_out_ptr += 32;
+ // acc_tmp[xx] = aie::zeros();
+ // }
+ // // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning
+ // of input ptr for remainder input_offset1 -= 448; // reset to
+ // beginning of input ptr for remainder input_offset2 -= 448; // reset
+ // to beginning of input ptr for remainder
+ // // kernel ptr already at next oc/8
+ // i_out_ptr += (iw*8)-(iw_32_rem*32); // move to next oc/8
+ // (skip remainder section if present)
+ // }
+
+ // i_out_ptr -= output_channels*iw;
+
+ // for(int oc=0; oc<(output_channels/8); oc++) {
+ // for(int x8=0; x8 skip1 = aie::load_v<32>(skip_ptr); skip_ptr +=
+ // 32; aie::vector tmp = aie::load_v<32>(i_out_ptr);
+ // aie::accum accj;
+ // accj.from_vector(skip1,0);
+ // accj = aie::mac(accj, tmp, (uint8_t)1);
+ // aie::vector o3 = accj.to_vector(skip_scaleT);
+ // aie::store_v(out_ptr, o3); out_ptr += 32;
+ // }
+ // out_ptr += (iw*8)-(iw_32_rem*32);
+ // skip_ptr += (iw*8)-(iw_32_rem*32);
+ // }
+
+ // } // if(iw_32_rem > 0)
+
+ event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 1x1 skip init - vector
+// act: uint8, wts: int8, skip: uint8, out: uint8
+//*****************************************************************************
+// Fused 1x1 convolution + skip-path convolution (vectorized, uint8 acts).
+// Computes the main 1x1 conv (requantized by `scale`), a second 1x1 conv of
+// the `skip` tensor against skip weights appended after the main weights
+// (requantized by `scale_skip_conv`), adds the two, and requantizes the sum
+// by `skip_scale` into saturated uint8 output.
+// Layouts: activations are (C/8, W, C8); weights are {OC}{IC}{IC8}{OC8}.
+// The main input channels are split in half across input0 and input1
+// (input_channels/16 groups read from each). Only the iw_32 > 0 main path is
+// active; the iw_32_rem remainder path is still commented out (TODO).
+// NOTE(review): several template argument lists (aie::vector<...>,
+// aie::zeros<...>(), to_vector<...>(), aie::accum<...>) appear to have been
+// stripped from this listing, and the closing brace of the oc loop appears
+// to be missing between the x-loop close and the out_ptr/skip_ptr
+// adjustments below — restore from the original source before compiling.
+void conv2dk1_skip_init_ui8_vector(
+    uint8_t *input0, uint8_t *input1, int8_t *kernels, uint8_t *output,
+    uint8_t *skip, const int32_t input_width, const int32_t input_channels,
+    const int32_t output_channels, const int32_t input_channels_skip,
+    const int scale, const int skip_scale, const int scale_skip_conv)
+
+{
+  event0();
+
+  using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+  // using MMULi4x8x8 = aie::mmul<4, 8, 8, int8, int8>;
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+  ::aie::set_rounding(
+      aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+  uint8_t * /*restrict*/ out_ptr = output;
+  int8_t *i_out_ptr = (int8_t *)output;
+  // uint8_t * restrict skip_ptr = skip;
+  uint8_t *restrict skip_ptr = skip;
+
+  // Skip-path weights live immediately after the main conv weights.
+  const int wts_offset = output_channels * input_channels;
+  int8_t *kernels_skip = kernels + wts_offset;
+
+  const int scaleT = scale;
+  const int skip_scaleT = skip_scale;
+  const int scaleT_skip_conv = scale_skip_conv;
+
+  constexpr int NUM_ACC = 8;
+
+  // Each x iteration covers 32 pixels: 8 accumulators x 4 pixels (MMUL rows).
+  const int iw_32 = (input_width / 4) / 8;
+  const int iw = input_width;
+  const int iw_32_rem = (input_width / 4) % 8;
+
+  int input_offset1 = 0;
+  int input_offset2 = 0;
+  int input_offset3 = 0;
+
+  // aie::vector vec_tmp[NUM_ACC];
+
+  if (iw_32 > 0) {
+
+    for (int oc = 0; oc < (output_channels / 8); oc++) {
+      for (int x = 0; x < iw_32; x++) {
+        aie::vector vec_conv[NUM_ACC];
+        aie::vector vec_skip[NUM_ACC];
+
+        MMUL4x8x8 acc_tmp[NUM_ACC];
+        { // conv section
+          // Main 1x1 conv: first half of the input channels streams from
+          // input0, second half from input1.
+          // MMUL4x8x8 acc_tmp[NUM_ACC];
+          for (int x8 = 0; x8 < NUM_ACC; x8++) {
+            acc_tmp[x8] = aie::zeros();
+          }
+
+          for (int ic = 0; ic < (input_channels / 16); ic++) { // half ic/8
+            // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+            // (2x 256b loads) For ic > 8, we would load the next 64 weights
+            // that are ic8..15(oc0..7) For oc > 8, we would load the next 64
+            // weights after all the ic weights {OC}{IC}{IC8}{OC8}
+            aie::vector in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++)
+            // chess_prepare_for_pipelining //chess_loop_range(7, )
+            // e.g. 28/4 = 7
+            // 13 cycles delay for vload.
+            // 7 gives us 3 cycle inner loop.
+            // 13 gave 1 cycle inner loop before partial load, not it only gets
+            // 2 cycles (not sure why?)
+            {
+              aie::vector in_a =
+                  aie::load_v<32>(input0 + input_offset1);
+              input_offset1 += 32; // act oc0..3(ic0..7)
+              acc_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset1 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+          for (int ic = 0; ic < (input_channels / 16); ic++) { // half ic/8
+            // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+            // (2x 256b loads) For ic > 8, we would load the next 64 weights
+            // that are ic8..15(oc0..7) For oc > 8, we would load the next 64
+            // weights after all the ic weights {OC}{IC}{IC8}{OC8}
+            aie::vector in_b = aie::load_v<64>(kernels);
+            kernels += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++)
+            // chess_prepare_for_pipelining //chess_loop_range(7, )
+            // e.g. 28/4 = 7
+            // 13 cycles delay for vload.
+            // 7 gives us 3 cycle inner loop.
+            // 13 gave 1 cycle inner loop before partial load, not it only gets
+            // 2 cycles (not sure why?)
+            {
+              aie::vector in_a =
+                  aie::load_v<32>(input1 + input_offset2);
+              input_offset2 += 32; // act oc0..3(ic0..7)
+              acc_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset2 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+          for (int x8 = 0; x8 < NUM_ACC; x8++) {
+            vec_conv[x8] = acc_tmp[x8].to_vector(scaleT);
+          }
+        } // conv section
+
+        { // skip section
+          // Skip-path 1x1 conv against the appended skip weights; the
+          // accumulators are reused after the conv result was snapshotted
+          // into vec_conv above.
+          // MMULi4x8x8 acci_tmp[NUM_ACC];
+          for (int x8 = 0; x8 < NUM_ACC; x8++) {
+            acc_tmp[x8] = aie::zeros();
+          }
+
+          for (int ic = 0; ic < (input_channels_skip / 8); ic++) {
+            // For oc > 8, we would load the next 64 weights after all the ic
+            // weights {OC}{IC}{IC8}{OC8}
+            aie::vector in_b = aie::load_v<64>(kernels_skip);
+            kernels_skip += 64; // wts ic0..7(oc0..7)
+
+            for (int x8 = 0; x8 < NUM_ACC; x8++) {
+              aie::vector in_a =
+                  aie::load_v<32>(skip + input_offset3);
+              input_offset3 += 32; // act oc0..3(ic0..7)
+              acc_tmp[x8].mac(in_a, in_b);
+            }
+            input_offset3 +=
+                (iw * 8) -
+                256; // Move to next ic/8 position. 256 = 32 input * 8 ic
+          }
+          for (int x8 = 0; x8 < NUM_ACC; x8++) {
+            vec_skip[x8] = acc_tmp[x8].to_vector(scaleT_skip_conv);
+          }
+        } // skip section
+
+        // Combine: conv result + skip-conv result, requantized by skip_scale.
+        // input ptr just moves to next section
+        for (int x8 = 0; x8 < NUM_ACC; x8++) {
+          aie::accum accj;
+          accj.from_vector(vec_conv[x8], 0);
+          accj = aie::add(accj, vec_skip[x8]);
+          aie::vector o1 = accj.to_vector(skip_scaleT);
+          aie::store_v(out_ptr, o1);
+          out_ptr += 32;
+        }
+        // Rewind the per-channel strides so the next 32-pixel block starts
+        // 32 inputs further along the row.
+        input_offset1 -=
+            ((input_channels / 16) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        input_offset2 -=
+            ((input_channels / 16) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        input_offset3 -=
+            ((input_channels_skip / 8) * iw * 8) -
+            256; // reset to next input_width/32 block. 256 = 32 input * 8 ic
+        kernels -=
+            (input_channels / 8) * 64; // reset kernel back to beginning of ic/8
+        kernels_skip -= (input_channels_skip / 8) *
+                        64; // reset kernel back to beginning of ic/8
+      } // for(int x=0; x skip1 = aie::load_v<32>(skip_ptr);
+      // skip_ptr += 32; aie::vector skip1 =
+      // aie::load_v<32>(skip_ptr); skip_ptr += 32;
+      // // aie::vector tmp = aie::load_v<32>(out_ptr);
+      // aie::vector tmp = aie::load_v<32>(i_out_ptr);
+      // i_out_ptr += 32; aie::accum accj;
+      // accj.from_vector(skip1,0);
+      // accj = aie::mac(accj, tmp, (uint8_t)1);
+      // aie::vector o3 =
+      // accj.to_vector(skip_scaleT); aie::store_v(out_ptr,
+      // o3); out_ptr += 32;
+      // }
+      // }
+      // out_ptr += (iw_32_rem*32);
+      // skip_ptr += (iw_32_rem*32);
+      // }
+
+    out_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+    skip_ptr -= (output_channels - 1) * iw + (iw_32_rem * 32);
+
+  } // if(iw_32 > 0) {
+
+  // **TODO** Move out_ptr and skip_ptr back to first oc/8 rem location
+
+  // if(iw_32_rem > 0) {
+
+  //   const int ocs = output_channels;
+  //   const int ics = input_channels;
+
+  //   input_offset1 = 0; // TODO need to offset this to ic_32_rem position
+  //   input_offset2 = 0; // TODO need to offset this to ic_32_rem position
+
+  //   for(int oc=0; oc<(ocs/8); oc++) {
+  //     for(int ic=0; ic<(ics/16); ic++) {
+  //       // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+  //       (2x 256b loads)
+  //       // For ic > 8, we would load the next 64 weights that are
+  //       ic8..15(oc0..7)
+  //       // For oc > 8, we would load the next 64 weights after all the ic
+  //       weights {OC}{IC}{IC8}{OC8} aie::vector in_b =
+  //       aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+  //       for(int x=0; x in_a =
+  //       aie::load_v<32>(input0+input_offset1); input_offset1 += 32; //
+  //       act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+  //       }
+  //       input_offset1 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+  //       position, TODO -(iw_32_rem*8)??
+  //     }
+  //     for(int ic=0; ic<(ics/16); ic++) {
+  //       // For ic = oc = 8, we can load all the weights in 1x 512b vec reg
+  //       (2x 256b loads)
+  //       // For ic > 8, we would load the next 64 weights that are
+  //       ic8..15(oc0..7)
+  //       // For oc > 8, we would load the next 64 weights after all the ic
+  //       weights {OC}{IC}{IC8}{OC8} aie::vector in_b =
+  //       aie::load_v<64>(kernels); kernels+=64; // wts ic0..7(oc0..7)
+
+  //       for(int x=0; x in_a =
+  //       aie::load_v<32>(input1+input_offset2); input_offset2 += 32; //
+  //       act oc0..3(ic0..7) acc_tmp[x].mac(in_a, in_b);
+  //       }
+  //       input_offset2 += (iw*8)-(iw_32_rem*32); // Move to next ic/8
+  //       position
+  //     }
+  //     // input ptr just moves to next section
+  //     for(int xx=0; xx o1 = acc_tmp[xx].to_vector(scaleT);
+  //       aie::vector o1 = acc_tmp[xx].to_vector(scaleT);
+  //       // aie::store_v(out_ptr, o1); out_ptr += 32;
+  //       aie::store_v(i_out_ptr, o1); i_out_ptr += 32;
+  //       acc_tmp[xx] = aie::zeros();
+  //     }
+  //     // input -= ((ics-1)/8)*(iw*8)+(iw_32_rem*32); // reset to beginning
+  //     of input ptr for remainder input_offset1 -= 448; // reset to
+  //     beginning of input ptr for remainder input_offset2 -= 448; // reset
+  //     to beginning of input ptr for remainder
+  //     // kernel ptr already at next oc/8
+  //     i_out_ptr += (iw*8)-(iw_32_rem*32); // move to next oc/8
+  //     (skip remainder section if present)
+  //   }
+
+  //   i_out_ptr -= output_channels*iw;
+
+  //   for(int oc=0; oc<(output_channels/8); oc++) {
+  //     for(int x8=0; x8 skip1 = aie::load_v<32>(skip_ptr); skip_ptr +=
+  //     32; aie::vector tmp = aie::load_v<32>(i_out_ptr);
+  //     aie::accum accj;
+  //     accj.from_vector(skip1,0);
+  //     accj = aie::mac(accj, tmp, (uint8_t)1);
+  //     aie::vector o3 = accj.to_vector(skip_scaleT);
+  //     aie::store_v(out_ptr, o3); out_ptr += 32;
+  //     }
+  //     out_ptr += (iw*8)-(iw_32_rem*32);
+  //     skip_ptr += (iw*8)-(iw_32_rem*32);
+  //   }
+
+  // } // if(iw_32_rem > 0)
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+//*****************************************************************************
+// conv2d 1x1 skip init wrappers
+//*****************************************************************************
+extern "C" {
+
+#ifdef SCALAR
+
+#ifdef INT8_ACT
+
+// C-linkage entry point for the scalar int8 skip-init 1x1 convolution.
+// Forwards every argument unchanged to conv2dk1_skip_init_i8_scalar.
+void conv2dk1_skip_init_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                           uint8_t *output, int8_t *skip,
+                           const int32_t input_width,
+                           const int32_t input_channels,
+                           const int32_t output_channels,
+                           const int32_t input_channels_skip, const int scale,
+                           const int skip_scale, const int scale_skip_conv) {
+  conv2dk1_skip_init_i8_scalar(input0, input1, kernels, output, skip,
+                               input_width, input_channels, output_channels,
+                               input_channels_skip, scale, skip_scale,
+                               scale_skip_conv);
+}
+
+#else // UINT8_ACT
+
+// C-linkage entry point for the scalar uint8 skip-init 1x1 convolution.
+// NOTE(review): the call to the scalar implementation is commented out, so
+// this entry point is currently a silent no-op — confirm whether
+// conv2dk1_skip_init_ui8_scalar exists and should be wired up before this
+// build configuration (SCALAR + UINT8_ACT) is used.
+void conv2dk1_skip_init_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                            uint8_t *output, uint8_t *skip,
+                            const int32_t input_width,
+                            const int32_t input_channels,
+                            const int32_t output_channels,
+                            const int32_t input_channels_skip, const int scale,
+                            const int skip_scale, const int scale_skip_conv) {
+  // conv2dk1_skip_init_ui8_scalar(input0, input1, kernels, output, skip,
+  // input_width, input_channels, output_channels, input_channels_skip, scale,
+  // skip_scale, scale_skip_conv);
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+// C-linkage entry point for the vectorized int8 skip-init 1x1 convolution.
+// Forwards every argument unchanged to conv2dk1_skip_init_i8_vector.
+void conv2dk1_skip_init_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                           uint8_t *output, int8_t *skip,
+                           const int32_t input_width,
+                           const int32_t input_channels,
+                           const int32_t output_channels,
+                           const int32_t input_channels_skip, const int scale,
+                           const int skip_scale, const int scale_skip_conv) {
+  conv2dk1_skip_init_i8_vector(input0, input1, kernels, output, skip,
+                               input_width, input_channels, output_channels,
+                               input_channels_skip, scale, skip_scale,
+                               scale_skip_conv);
+}
+
+#else // UINT8_ACT
+
+// C-linkage entry point for the vectorized uint8 skip-init 1x1 convolution.
+// Fix: the dispatch call was commented out, leaving this entry point a
+// silent no-op even though conv2dk1_skip_init_ui8_vector is implemented
+// above (the i8 sibling wrapper already dispatches to its implementation).
+void conv2dk1_skip_init_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                            uint8_t *output, uint8_t *skip,
+                            const int32_t input_width,
+                            const int32_t input_channels,
+                            const int32_t output_channels,
+                            const int32_t input_channels_skip, const int scale,
+                            const int skip_scale, const int scale_skip_conv) {
+  conv2dk1_skip_init_ui8_vector(input0, input1, kernels, output, skip,
+                                input_width, input_channels, output_channels,
+                                input_channels_skip, scale, skip_scale,
+                                scale_skip_conv);
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+} // extern "C"
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk1_skip_init.h b/aie_kernels/aie2/conv2dk1_skip_init.h
new file mode 100755
index 0000000000..cfb4b8b467
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk1_skip_init.h
@@ -0,0 +1,33 @@
+//===- conv2dk1_skip_init.h -------------------------------------------------*-
+// C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CONV2DK1_SKIP_INIT_H
+#define _CONV2DK1_SKIP_INIT_H
+
+#include <stdint.h>
+
+extern "C" {
+
+// 1x1 convolution fused with a skip-connection initializer (int8 skip).
+// Fix: these declarations were missing the input_channels_skip and
+// scale_skip_conv parameters present in the definitions; with C linkage the
+// mismatch is not caught at link time, so header-based callers would pass
+// misaligned arguments. Signatures now match the .cc definitions exactly.
+void conv2dk1_skip_init_i8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                           uint8_t *output, int8_t *skip,
+                           const int32_t input_width,
+                           const int32_t input_channels,
+                           const int32_t output_channels,
+                           const int32_t input_channels_skip, const int scale,
+                           const int skip_scale, const int scale_skip_conv);
+
+// Same operation with uint8 skip activations.
+void conv2dk1_skip_init_ui8(uint8_t *input0, uint8_t *input1, int8_t *kernels,
+                            uint8_t *output, uint8_t *skip,
+                            const int32_t input_width,
+                            const int32_t input_channels,
+                            const int32_t output_channels,
+                            const int32_t input_channels_skip, const int scale,
+                            const int skip_scale, const int scale_skip_conv);
+
+} // extern "C"
+
+#endif
diff --git a/aie_kernels/aie2/conv2dk3.cc b/aie_kernels/aie2/conv2dk3.cc
new file mode 100755
index 0000000000..e0f3d9e1b5
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk3.cc
@@ -0,0 +1,1434 @@
+//===- conv2dk3.cc -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include
+#include
+#include
+
+#include
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+// Vertical position of the current three-row band within the image; border
+// rows skip the line0 (top) or line2 (bottom) kernel taps.
+enum region { top, middle, bottom };
+
+#ifdef SCALAR
+
+// Saturation ceiling for the uint8 output range (lower bound is clamped to 0).
+const int32_t MAX = 255;
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 3x3 - scalar
+// act: int8, wts: int8, out: uint8
+//*****************************************************************************
+// 3x3 convolution (scalar reference) over channel-tiled int8 activations,
+// producing rounded, right-shifted (by `scale`) output clamped to [0, MAX].
+// Layout: activations/output are (C/8, W, C8); weights are
+// (OC/8, IC/8, KY, KX, IC8, OC8), hence the *64 strides below.
+// line0/line1/line2 are three consecutive input rows; `check`
+// (top/middle/bottom) selects which rows contribute, giving a zero border in
+// y. x == 0 and x == input_width-1 are handled before the main x loop to
+// apply a zero border in x. channel_offset selects which oc/8 slice of the
+// weights this core computes.
+// Fix: removed the unused outer wts_indx_0/1/2 locals — every inner loop
+// declares its own shadowing copies, so the outer ones were dead.
+// NOTE(review): the weight offsets hardcode kernel_height == 3 (the
+// 0/1/2 * 3 * 64 terms) and border handling assumes kernel_width == 3, so
+// the kernel_width/kernel_height parameters are not fully general — confirm
+// callers only pass 3x3. Assumes scale >= 1 (scale == 0 would shift by -1).
+void conv2dk3_i8_scalar(int8_t *line0, int8_t *line1, int8_t *line2,
+                        int8_t *wts, uint8_t *output, const int32_t input_width,
+                        const int32_t input_channels,
+                        const int32_t output_channels,
+                        const int32_t kernel_width, const int32_t kernel_height,
+                        const int32_t check, const int scale,
+                        const int channel_offset) {
+  event0();
+
+  int x, ki, ic, oc, ic8, oc8;
+  int32_t sum;
+  int sum_srs;
+  int in_indx_0 = 0;
+  // for (oc = (0+channel_offset)/8; oc < (output_channels+channel_offset)/8;
+  // oc++) {
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    int oc_ofst = oc + (channel_offset / 8);
+    for (oc8 = 0; oc8 < 8; oc8++) {
+
+      // left border (x == 0): tap ki == 0 would read x == -1, so the ki loop
+      // starts at 1 and shifts the window, implementing zero padding.
+      sum = 0;
+      sum_srs = 0;
+      for (ic = 0; ic < input_channels / 8; ic++) {
+        for (ic8 = 0; ic8 < 8; ic8++) {
+          for (ki = 1; ki < kernel_width; ki++) {
+
+            // wts_indx_0=0*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc; wts_indx_1=1*3 + ki +
+            // 3*kernel_width*ic + 3*kernel_width*input_channels*oc;
+            // wts_indx_2=2*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc;
+            int wts_indx_0 =
+                (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_1 =
+                (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_2 =
+                (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+
+            // ki == 0 is unreachable here (loop starts at 1); kept for
+            // symmetry with the interior-column indexing.
+            if (ki == 0) {
+              // in_indx_0=0+ki+input_width*ic;
+              in_indx_0 = (0 + ki) * 8 + ((ic * input_width * 8) + ic8);
+            } else {
+              // in_indx_0=0+ki-1+input_width*ic;
+              in_indx_0 = (0 + ki - 1) * 8 + ((ic * input_width * 8) + ic8);
+            }
+
+            if (check != top)
+              sum += line0[in_indx_0] * wts[wts_indx_0];
+            sum += line1[in_indx_0] * wts[wts_indx_1];
+            if (check != bottom)
+              sum += line2[in_indx_0] * wts[wts_indx_2];
+          }
+        }
+      }
+      // output[oc * (input_width) + 0] = sum;
+      sum_srs = (sum + (1 << (scale - 1))) >> scale; // round-half-up shift
+      sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs;
+      // output[oc * input_width + 0] = sum_srs;
+      output[(oc * input_width * 8) + oc8] = sum_srs;
+
+      // right border (x == input_width-1): the last tap would read
+      // x == input_width, so the ki loop stops one early (zero padding).
+      // The ki != 2 test below is always true for 3x3 since ki <
+      // kernel_width - 1; the else branch is unreachable.
+      sum = 0;
+      sum_srs = 0;
+      for (ic = 0; ic < input_channels / 8; ic++) {
+        for (ic8 = 0; ic8 < 8; ic8++) {
+          for (ki = 0; ki < kernel_width - 1; ki++) {
+            // wts_indx_0=0*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc; wts_indx_1=1*3 + ki +
+            // 3*kernel_width*ic + 3*kernel_width*input_channels*oc;
+            // wts_indx_2=2*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc;
+            int wts_indx_0 =
+                (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_1 =
+                (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_2 =
+                (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+
+            if (ki != 2) {
+              // in_indx_0=input_width-2+ki+input_width*ic;
+              in_indx_0 =
+                  (input_width - 2 + ki) * 8 + ((ic * input_width * 8) + ic8);
+            } else { // replicate 1 border pixel on the right
+              // in_indx_0=input_width-2+ki-1+input_width*ic;
+              in_indx_0 = (input_width - 2 + ki - 1) * 8 +
+                          ((ic * input_width * 8) + ic8);
+            }
+            if (check != top)
+              sum += line0[in_indx_0] * wts[wts_indx_0];
+            sum += line1[in_indx_0] * wts[wts_indx_1];
+            if (check != bottom)
+              sum += line2[in_indx_0] * wts[wts_indx_2];
+          }
+        }
+      }
+      sum_srs = (sum + (1 << (scale - 1))) >> scale;
+      sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs;
+      // output[oc * input_width + input_width-1] = sum_srs;
+      output[(oc * input_width * 8) + (input_width - 1) * 8 + oc8] = sum_srs;
+      // output[oc * (input_width) + input_width-1] = sum;
+
+      // interior columns: full kernel window centered on x.
+      for (x = 1; x < input_width - 1; x++) { // col of output image
+        sum = 0;
+        sum_srs = 0;
+        for (ic = 0; ic < input_channels / 8; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            for (ki = 0; ki < kernel_width; ki++) {
+              // wts format - orig is oc,ic,ky,kx, reformat is
+              // oc,ic,k0..k8,ic8,oc8
+
+              // int wts_indx_0=0*3 + ki + 3*kernel_width*ic +
+              // 3*kernel_width*input_channels*oc; int wts_indx_1=1*3 + ki +
+              // 3*kernel_width*ic + 3*kernel_width*input_channels*oc; int
+              // wts_indx_2=2*3 + ki + 3*kernel_width*ic +
+              // 3*kernel_width*input_channels*oc;
+              int wts_indx_0 =
+                  (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                  (ic8 * 8) +
+                  (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) +
+                  oc8;
+              int wts_indx_1 =
+                  (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                  (ic8 * 8) +
+                  (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) +
+                  oc8;
+              int wts_indx_2 =
+                  (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                  (ic8 * 8) +
+                  (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) +
+                  oc8;
+
+              // int in_indx_0=x-1+ki+input_width*ic;
+              int in_indx_0 = (x - 1 + ki) * 8 + ((ic * input_width * 8) + ic8);
+
+              if (check != top)
+                sum += line0[in_indx_0] * wts[wts_indx_0];
+              sum += line1[in_indx_0] * wts[wts_indx_1];
+              if (check != bottom)
+                sum += line2[in_indx_0] * wts[wts_indx_2];
+            }
+          }
+        }
+        sum_srs = (sum + (1 << (scale - 1))) >> scale;
+        sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs;
+        output[(oc * input_width * 8) + x * 8 + oc8] = sum_srs;
+        // output[oc * (input_width) + x] = sum;
+      }
+    }
+  }
+
+  event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 3x3 - scalar
+// act: uint8, wts: int8, out: uint8
+//*****************************************************************************
+// 3x3 convolution (scalar reference) over channel-tiled uint8 activations,
+// producing rounded, right-shifted (by `scale`) output clamped to [0, MAX].
+// Identical structure to conv2dk3_i8_scalar except for the activation type.
+// Layout: activations/output are (C/8, W, C8); weights are
+// (OC/8, IC/8, KY, KX, IC8, OC8), hence the *64 strides below.
+// line0/line1/line2 are three consecutive input rows; `check`
+// (top/middle/bottom) selects which rows contribute, giving a zero border in
+// y. x == 0 and x == input_width-1 are handled before the main x loop to
+// apply a zero border in x. channel_offset selects which oc/8 slice of the
+// weights this core computes.
+// Fix: removed the unused outer wts_indx_0/1/2 locals — every inner loop
+// declares its own shadowing copies, so the outer ones were dead.
+// NOTE(review): the weight offsets hardcode kernel_height == 3 and border
+// handling assumes kernel_width == 3 — confirm callers only pass 3x3.
+// Assumes scale >= 1 (scale == 0 would shift by -1).
+void conv2dk3_ui8_scalar(uint8_t *line0, uint8_t *line1, uint8_t *line2,
+                         int8_t *wts, uint8_t *output,
+                         const int32_t input_width,
+                         const int32_t input_channels,
+                         const int32_t output_channels,
+                         const int32_t kernel_width,
+                         const int32_t kernel_height, const int32_t check,
+                         const int scale, const int channel_offset) {
+  event0();
+
+  int x, ki, ic, oc, ic8, oc8;
+  int32_t sum;
+  int sum_srs;
+  int in_indx_0 = 0;
+  // for (oc = (0+channel_offset)/8; oc < (output_channels+channel_offset)/8;
+  // oc++) {
+  for (oc = 0; oc < output_channels / 8; oc++) {
+    int oc_ofst = oc + (channel_offset / 8);
+    for (oc8 = 0; oc8 < 8; oc8++) {
+
+      // left border (x == 0): tap ki == 0 would read x == -1, so the ki loop
+      // starts at 1 and shifts the window, implementing zero padding.
+      sum = 0;
+      sum_srs = 0;
+      for (ic = 0; ic < input_channels / 8; ic++) {
+        for (ic8 = 0; ic8 < 8; ic8++) {
+          for (ki = 1; ki < kernel_width; ki++) {
+
+            // wts_indx_0=0*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc; wts_indx_1=1*3 + ki +
+            // 3*kernel_width*ic + 3*kernel_width*input_channels*oc;
+            // wts_indx_2=2*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc;
+            int wts_indx_0 =
+                (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_1 =
+                (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_2 =
+                (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+
+            // ki == 0 is unreachable here (loop starts at 1); kept for
+            // symmetry with the interior-column indexing.
+            if (ki == 0) {
+              // in_indx_0=0+ki+input_width*ic;
+              in_indx_0 = (0 + ki) * 8 + ((ic * input_width * 8) + ic8);
+            } else {
+              // in_indx_0=0+ki-1+input_width*ic;
+              in_indx_0 = (0 + ki - 1) * 8 + ((ic * input_width * 8) + ic8);
+            }
+
+            if (check != top)
+              sum += line0[in_indx_0] * wts[wts_indx_0];
+            sum += line1[in_indx_0] * wts[wts_indx_1];
+            if (check != bottom)
+              sum += line2[in_indx_0] * wts[wts_indx_2];
+          }
+        }
+      }
+      // output[oc * (input_width) + 0] = sum;
+      sum_srs = (sum + (1 << (scale - 1))) >> scale; // round-half-up shift
+      sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs;
+      // output[oc * input_width + 0] = sum_srs;
+      output[(oc * input_width * 8) + oc8] = sum_srs;
+
+      // right border (x == input_width-1): the last tap would read
+      // x == input_width, so the ki loop stops one early (zero padding).
+      // The ki != 2 test below is always true for 3x3 since ki <
+      // kernel_width - 1; the else branch is unreachable.
+      sum = 0;
+      sum_srs = 0;
+      for (ic = 0; ic < input_channels / 8; ic++) {
+        for (ic8 = 0; ic8 < 8; ic8++) {
+          for (ki = 0; ki < kernel_width - 1; ki++) {
+            // wts_indx_0=0*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc; wts_indx_1=1*3 + ki +
+            // 3*kernel_width*ic + 3*kernel_width*input_channels*oc;
+            // wts_indx_2=2*3 + ki + 3*kernel_width*ic +
+            // 3*kernel_width*input_channels*oc;
+            int wts_indx_0 =
+                (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_1 =
+                (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+            int wts_indx_2 =
+                (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                (ic8 * 8) +
+                (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) + oc8;
+
+            if (ki != 2) {
+              // in_indx_0=input_width-2+ki+input_width*ic;
+              in_indx_0 =
+                  (input_width - 2 + ki) * 8 + ((ic * input_width * 8) + ic8);
+            } else { // replicate 1 border pixel on the right
+              // in_indx_0=input_width-2+ki-1+input_width*ic;
+              in_indx_0 = (input_width - 2 + ki - 1) * 8 +
+                          ((ic * input_width * 8) + ic8);
+            }
+            if (check != top)
+              sum += line0[in_indx_0] * wts[wts_indx_0];
+            sum += line1[in_indx_0] * wts[wts_indx_1];
+            if (check != bottom)
+              sum += line2[in_indx_0] * wts[wts_indx_2];
+          }
+        }
+      }
+      sum_srs = (sum + (1 << (scale - 1))) >> scale;
+      sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs;
+      // output[oc * input_width + input_width-1] = sum_srs;
+      output[(oc * input_width * 8) + (input_width - 1) * 8 + oc8] = sum_srs;
+      // output[oc * (input_width) + input_width-1] = sum;
+
+      // interior columns: full kernel window centered on x.
+      for (x = 1; x < input_width - 1; x++) { // col of output image
+        sum = 0;
+        sum_srs = 0;
+        for (ic = 0; ic < input_channels / 8; ic++) {
+          for (ic8 = 0; ic8 < 8; ic8++) {
+            for (ki = 0; ki < kernel_width; ki++) {
+              // wts format - orig is oc,ic,ky,kx, reformat is
+              // oc,ic,k0..k8,ic8,oc8
+
+              // int wts_indx_0=0*3 + ki + 3*kernel_width*ic +
+              // 3*kernel_width*input_channels*oc; int wts_indx_1=1*3 + ki +
+              // 3*kernel_width*ic + 3*kernel_width*input_channels*oc; int
+              // wts_indx_2=2*3 + ki + 3*kernel_width*ic +
+              // 3*kernel_width*input_channels*oc;
+              int wts_indx_0 =
+                  (0 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                  (ic8 * 8) +
+                  (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) +
+                  oc8;
+              int wts_indx_1 =
+                  (1 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                  (ic8 * 8) +
+                  (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) +
+                  oc8;
+              int wts_indx_2 =
+                  (2 * 3 * 64) + (ki * 64) + (ic * 3 * kernel_width * 64) +
+                  (ic8 * 8) +
+                  (oc_ofst * (input_channels / 8) * 3 * kernel_width * 64) +
+                  oc8;
+
+              // int in_indx_0=x-1+ki+input_width*ic;
+              int in_indx_0 = (x - 1 + ki) * 8 + ((ic * input_width * 8) + ic8);
+
+              if (check != top)
+                sum += line0[in_indx_0] * wts[wts_indx_0];
+              sum += line1[in_indx_0] * wts[wts_indx_1];
+              if (check != bottom)
+                sum += line2[in_indx_0] * wts[wts_indx_2];
+            }
+          }
+        }
+        sum_srs = (sum + (1 << (scale - 1))) >> scale;
+        sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs;
+        output[(oc * input_width * 8) + x * 8 + oc8] = sum_srs;
+        // output[oc * (input_width) + x] = sum;
+      }
+    }
+  }
+
+  event1();
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+//*****************************************************************************
+// conv2d 3x3 - vector
+// act: int8, wts: int8, out: uint8
+//*****************************************************************************
+void conv2dk3_i8_vector(int8_t *line0, int8_t *line1, int8_t *line2,
+ int8_t *wts, uint8_t *output, const int32_t input_width,
+ const int32_t input_channels,
+ const int32_t output_channels,
+ const int32_t kernel_width, const int32_t kernel_height,
+ const int32_t check, const int scale,
+ const int channel_offset) {
+ event0();
+
+ // Compute
+ using MMUL4x8x8 = aie::mmul<4, 8, 8, int8, int8>;
+ ::aie::set_saturation(
+ aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+ ::aie::set_rounding(
+ aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+ constexpr unsigned VecFactor = 16;
+
+ // const int scale = 11;
+
+ // basic MMUL intrinisic needed is k x ic x oc
+ // k is number of inputs processed at a time
+ // So if ic=8, oc=4, then k=8 and we use 8x8x4
+ const unsigned k =
+ 256 / (input_channels * output_channels); // 8 inputs per vector output
+
+ aie::vector zero32 = aie::zeros();
+
+ // aie::vector prev_a[3],
+ // aie::vector in_a;
+ // aie::vector in_b;
+ // aie::vector tmp_a;
+ // aie::vector tmp_a1, tmp_a2;
+
+ // int8_t * restrict line[3];
+ int8_t *line[3];
+ line[0] = line0;
+ line[1] = line1;
+ line[2] = line2;
+
+ // int8_t * restrict wtsLine[3];
+ int8_t *wtsLine[3];
+ // oc,ic,ky,kx,ic8,oc8
+ wtsLine[0] = wts + (channel_offset / 8) * (input_channels / 8) *
+ kernel_height * kernel_width * 64;
+ wtsLine[1] = wts +
+ (channel_offset / 8) * (input_channels / 8) * kernel_height *
+ kernel_width * 64 +
+ kernel_width * 64; // next kernel line is always 8*8 away
+ wtsLine[2] = wts +
+ (channel_offset / 8) * (input_channels / 8) * kernel_height *
+ kernel_width * 64 +
+ 2 * kernel_width * 64; // next kernel line is always 8*8 away
+
+ MMUL4x8x8 acc_tmp[8];
+
+ // Zero accumulators used for storing partial results
+ // for(int x=0; x();
+ }
+
+ // TODO temporary workaround. When assigned to input_width, the results are
+ // wrong. ???
+ const int iw = 32;
+ // const int32_t iw = input_width;
+
+ // const int iw_32 = ((input_width/4)-2)/8;
+ // const int iw_32 = ((iw/4)-2)/8;
+ // const int iw_32 = ((32/4)-2)/8;
+ const int iw_32 = 0;
+
+ // const int iw_32_rem = ((input_width/4)-2) % 8;
+ // const int iw_32_rem = ((iw/4)-2) % 8;
+ // const int iw_32_rem = ((32/4)-2) % 8;
+ const int iw_32_rem = 6;
+
+ // output += (channel_offset*iw); // channel_offset/8*iw*8
+
+ int kernel_height_start;
+ int kernel_height_end;
+
+ // int kernel_height_start, kernel_height_end;
+#ifdef BORDER_REPLICATE
+ kernel_height_start = 0;
+ kernel_height_end = kernel_height;
+ // constexpr int kernel_height_start = 0;
+ // constexpr int kernel_height_end = kernel_height;
+#else // Zero border for 3x3
+ // constexpr int kernel_height_start = 0;
+ // constexpr int kernel_height_end = kernel_height-1;
+
+ // if(check == top)
+ // idx_adj = 1;
+
+ // We skip top or bottom row for zero border
+ switch (check) {
+ case top:
+ kernel_height_start = 1;
+ kernel_height_end = kernel_height;
+ break;
+ case middle:
+ kernel_height_start = 0;
+ kernel_height_end = kernel_height;
+ break;
+ case bottom:
+ kernel_height_start = 0;
+ kernel_height_end = kernel_height - 1;
+ break;
+ }
+#endif
+
+ // --------------------------------------------------------------------
+ // Leftmost pattern
+ // --------------------------------------------------------------------
+ // Computes leftmost 4 inputs for all input/output channels.
+ // This shifts the leftmost input data by 1 (x8 channels) for 3x3 to
+ // account for border. Border replicate copies the leftmost input while
+ // 0 border shifts in 0's. If we need to support larger than 3x3, the
+ // replicate logic would need to be changed.
+ // --------------------------------------------------------------------
+ {
+ // in_b = aie::load_v<64>(wtsLine[kernel_height_start]);
+ // wtsLine[kernel_height_start] +=64; // wts ic0..7(oc0..7)
+
+ MMUL4x8x8 acc1 = aie::zeros();
+
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int ic = 0; ic < (input_channels / 8); ic++) {
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, )
+ // chess_unroll_loop()
+ {
+ // aie::vector tmp_a1, tmp_a2;
+ // Load input data [a0 a1 a2 a3 a4 a5 a6 a7] where each position has
+ // data for 8 channels
+ auto tmp_a1 = aie::load_v<32>(line[i]);
+ line[i] += 32; // act 0..3 (ic0..7 for each)
+ auto tmp_a2 =
+ aie::load_v<32>(line[i]); // act 4..7 (ic0..7 for each)
+ auto in_a = aie::concat(tmp_a1, tmp_a2);
+
+#ifdef BORDER_REPLICATE
+ tmp_a1 = aie::shuffle_up(tmp_a1, 24);
+ tmp_a.insert<32>(1, tmp_a1);
+#else
+ tmp_a = aie::zeros();
+#endif
+ // Shift right 1 input (8 channels) [- a0 a1 a2 a3 a4 a5 a6] where -
+ // is either a0 or 0's
+ in_a = aie::shuffle_up_fill(in_a, tmp_a, 8);
+
+ // Previous buffer stores shifted data, [- - - - a0 a1 a2 a3]
+ // where - is
+ // prev_a[i] = aie::shuffle_up(in_a, 24); // Shift right (4-1)*8
+
+ // prev_a[i] = in_a;
+ // prev_a[i] = aie::shuffle_up(prev_a[i], 24); // Shift right
+ // (4-1)*8
+
+ // For kernel width, we load 64 weights (8 ics x 8 ocs) and multiply
+ // it with the act buffer. acc[32] += in_a[32] * wts[64] We then
+ // shift the buffer left by 1 data position (8 channels).
+ for (int j = 0; j < kernel_width; j++)
+ // chess_unroll_loop()
+ {
+ auto in_b = aie::load_v<64>(wtsLine[i]);
+ wtsLine[i] += 64; // wts ic0..7(oc0..7)
+ acc1.mac(in_a.extract<32>(0), in_b);
+ // Shift input A by 1 row (1x8) which is by 1 (the 8 is the ic=8)
+ in_a = aie::shuffle_down(in_a, 8);
+ }
+ wtsLine[i] -=
+ (kernel_width * 64); // Reset weight pointer for this line
+ // wtsLine[i] += ((kernel_height-1)*kernel_width*64); // Move to
+ // next ic/8 position No need to load next set of weights because
+ // next row of weights immediately follows line[i] += (iw*4)*8; //
+ // Increment to next ic/8 position (reset at end of outermost loop)
+ } // for(int i=kernel_height_start; i o1 = acc1.to_vector(scale);
+ aie::store_v(output, o1);
+ output += iw * 8; // Shift to next oc/8 offset for left side
+
+ acc1 = aie::zeros();
+
+ // Shift back to beginning of input
+ for (int i = kernel_height_start; i < kernel_height_end; i++) {
+ line[i] -= (input_channels / 8) * (iw * 8);
+ }
+
+ } // for(int oc=0; oc<(output_channels/8); oc++) {
+
+ // Reset output to beginning, then add 4*8
+ // Reset wts to beginning of wts
+ // Reset line to beginning of input, then add 4*8
+ output -= (output_channels / 8) * (iw * 8) - 32;
+ for (int i = kernel_height_start; i < kernel_height_end; i++) {
+ wtsLine[i] -= (output_channels / 8) * (input_channels / 8) *
+ kernel_width * kernel_height *
+ 64; // kernel_width*kernel_height*8*8
+ // line[i] -= (output_channels/8)*(input_channels/8)*(iw*8)-32; //
+ line[i] += 32;
+ }
+ }
+
+ // --------------------------------------------------------------------
+ // Middle pattern
+ // --------------------------------------------------------------------
+ // The middle section algorithm is different because we want to minimize
+ // the reloading of weights and activations. So instead, we use up to 8
+ // accumulators to store partial products with activations being shifted.
+ // Then for the next kernel position, we reload weights.
+ //
+ // H,W,C8
+ // --------------------------------------------------------------------
+
+ // Main loop for when input_width/4-2 > 8
+ if (iw_32 > 0) {
+
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) {
+ for (int ic = 0; ic < (input_channels / 8); ic++) {
+ for (int i = kernel_height_start; i < kernel_height_end;
+ i++) { // 1 to 3
+
+ for (int j = 0; j < kernel_width; j++) {
+ aie::vector wtsVec = aie::load_v<64>(wtsLine[i]);
+ wtsLine[i] += 64;
+
+ // auto prev = prev_a[i].extract<32>(1); // prev
+ // = x0..x3(ci0..ci7)
+ auto prev = aie::load_v<32>((line[i] - 32));
+ auto curr = aie::load_v<32>((line[i]));
+ line[i] += 32;
+ auto next = aie::load_v<32>((line[i]));
+ line[i] += 32;
+
+ for (int x = 0; x < 8; x++)
+ // chess_unroll_loop()
+ {
+ auto tmp1 = aie::concat(curr, next);
+ auto tprev = aie::concat(zero32, prev);
+ auto tmp2 = aie::shuffle_up_fill(
+ tmp1, tprev, 8); // curr = x3..x6(ci0..ci7)
+ auto tmp3 = aie::shuffle_down(
+ tmp2, j * 8); // curr = x4..x7(ci0..ci7) to
+ // x5..x8(ci0..ci7)ss
+
+ prev = curr;
+ curr = next;
+ next = aie::load_v<32>(line[i]);
+ line[i] += 32; // next_prev = x4..x7(ci0..ci7)
+
+ acc_tmp[x].mac(tmp3.extract<32>(0), wtsVec);
+ } // for(int x=0; x<8; x++)
+ line[i] -= 320; // (8+2)*32, Reset line buffer ptr to beginning of
+ // line (after first 4)
+ } // for(int j=0; j o1 = acc_tmp[x].to_vector(scale);
+ aie::store_v(output, o1);
+ output += 32;
+ acc_tmp[x] = aie::zeros();
+ }
+ // For next 8 activations, reset line buffer and weights
+ for (int i = kernel_height_start; i < kernel_height_end; i++) {
+ line[i] -=
+ (input_channels / 8) * (iw * 8); // length of act to shift back
+ }
+ } // for(int iw_32c=0; iw_32c 0)
+
+ // Secondary loop for input_width remainder (iw_32_rem < 8)
+ if (iw_32_rem > 0) {
+
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int ic = 0; ic < (input_channels / 8); ic++) {
+ for (int i = kernel_height_start; i < kernel_height_end;
+ i++) { // 1 to 3
+ for (int j = 0; j < kernel_width; j++) {
+ // New weight every kernel_width
+ aie::vector wtsVec = aie::load_v<64>(wtsLine[i]);
+ wtsLine[i] += 64;
+ // auto prev = prev_a[i].extract<32>(1); // prev =
+ // x0..x3(ci0..ci7)
+ auto prev = aie::load_v<32>((line[i] - 32));
+ auto curr = aie::load_v<32>((line[i]));
+ line[i] += 32;
+ auto next = aie::load_v<32>((line[i]));
+ line[i] += 32;
+
+ for (int x = 0; x < iw_32_rem; x++) // remainder input width < 8
+ // chess_unroll_loop()
+ {
+ auto tmp1 = aie::concat(curr, next);
+ auto tprev = aie::concat(zero32, prev);
+ auto tmp2 = aie::shuffle_up_fill(
+ tmp1, tprev, 8); // curr = x3..x6(ci0..ci7)
+ auto tmp3 = aie::shuffle_down(
+ tmp2,
+ j * 8); // curr = x3..x6(ci0..ci7) to x5..x8(ci0..ci7)ss
+
+ prev = curr;
+ curr = next;
+ next = aie::load_v<32>(line[i]);
+ line[i] += 32; // next_prev = x4..x7(ci0..ci7)
+
+ acc_tmp[x].mac(tmp3.extract<32>(0), wtsVec);
+ }
+ line[i] -=
+ (iw_32_rem + 2) * 32; // Reset line buffer ptr to beginning of
+ // line (after first 4)
+ } // for(int j=0; j o1 = acc_tmp[x].to_vector(scale);
+ aie::store_v(output, o1);
+ output += 32;
+ acc_tmp[x] = aie::zeros(); // Reset accumulators
+ }
+ // Reset line ptr to beginning of input
+ for (int i = kernel_height_start; i < kernel_height_end; i++) {
+ line[i] -= (input_channels / 8) * (iw * 8);
+ }
+ // Output ptr should be in the right place (next oc/8)
+ output += (iw * 8) - (iw_32_rem * 32); // 32 = 4*8, shift to next oc/8
+ } // for(int oc=0; oc<(output_channels/8); oc++)
+ // Reset weights and line buffers for right side
+ for (int i = kernel_height_start; i < kernel_height_end; i++) {
+ wtsLine[i] -= (output_channels / 8) * (input_channels / 8) *
+ kernel_width * kernel_height *
+ 64; // kernel_width*kernel_height*8*8
+ line[i] +=
+ iw_32_rem * 32; // shift to beginnign of right data, iw_32_rem*4*8
+ }
+ // shift back so we're aligned with beginning of first oc/8 (rightmost 4
+ // data)
+ output -= (output_channels / 8) * (iw * 8) - (iw_32_rem * 32);
+
+ } // if (iw_32_rem > 0) {
+
+ // --------------------------------------------------------------------
+ // Right patterns
+ // --------------------------------------------------------------------
+ //
+ // --------------------------------------------------------------------
+ {
+ MMUL4x8x8 acc1 = aie::zeros();
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int ic = 0; ic < (input_channels / 8); ic++) {
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, )
+ // chess_unroll_loop()
+ {
+ // Load next set of data for input A (matrix row), need stride info
+ // or line1/2/3 pointer
+ // TODO, did not store previous so need to load it again
+ // in_a = aie::load_v<64>(line[i]-32);
+ auto tmp_a1 =
+ aie::load_v<32>(line[i] - 32); // act 24..27 (ic0..7 for each)
+ auto tmp_a2 =
+ aie::load_v<32>(line[i]); // act 28..31 (ic0..7 for each)
+ auto in_a = aie::concat(tmp_a1, tmp_a2);
+#ifdef BORDER_REPLICATE
+ tmp_a2 = aie::shuffle_down(tmp_a2, 24);
+ tmp_a.insert<32>(0, tmp_a2);
+#else
+ auto tmp_a = aie::zeros();
+#endif
+ // shift by 32-8 (fill 32 then shift up by 8)
+ in_a = aie::shuffle_down_fill(in_a, tmp_a, 24); // act 27..31 - - -
+
+ for (int j = 0; j < kernel_width; j++)
+ // chess_unroll_loop()
+ {
+ auto in_b = aie::load_v<64>(wtsLine[i]);
+ wtsLine[i] += 64; // wts ic0..7(oc0..7)
+ acc1.mac(in_a.extract<32>(0), in_b);
+ // Shift input A by 1 row (1x8) which is by 1 (the 8 is the ic=8)
+ in_a = aie::shuffle_down(in_a, 8);
+ }
+ wtsLine[i] += ((kernel_height - 1) * kernel_width *
+ 64); // Move to next ic/8 position
+ // No need to load next set of weights because next row of weights
+ // immediately follows
+ line[i] += (iw * 8); // Increment to next ic/8 position (reset at
+ // end of outermost loop)
+ } // for(int i=kernel_height_start; i o1 = acc1.to_vector(scale);
+ aie::store_v(output, o1);
+ output += iw * 8; // Shift to next oc/8
+
+ acc1 = aie::zeros();
+
+ for (int i = kernel_height_start; i < kernel_height_end; i++) {
+ line[i] -= (input_channels / 8) *
+ (iw * 8); // shift back to beginning of this section
+ }
+ } // for(int oc=0; oc<(output_channels/8); oc++) {
+ }
+ event1();
+}
+
+#else // UINT8_ACT
+
+//*****************************************************************************
+// conv2d 3x3 - vector
+// act: uint8, wts: int8, out: uint8
+//*****************************************************************************
+// Takes 3 input lines and computes 1 output line
+void conv2dk3_ui8_vector(uint8_t *line0, uint8_t *line1, uint8_t *line2,
+ int8_t *wts, uint8_t *output,
+ const int32_t input_width,
+ const int32_t input_channels,
+ const int32_t output_channels,
+ const int32_t kernel_width,
+ const int32_t kernel_height, const int32_t check,
+ const int scale, const int channel_offset) {
+ event0();
+
+ // Compute
+ using MMUL4x8x8 = aie::mmul<4, 8, 8, uint8, int8>;
+ ::aie::set_saturation(
+ aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+ ::aie::set_rounding(
+ aie::rounding_mode::positive_inf); // Needed to saturate properly to uint8
+
+ constexpr unsigned VecFactor = 16;
+
+ // const int scale = 11;
+
+ // basic MMUL intrinsic needed is k x ic x oc
+ // k is number of inputs processed at a time
+ // So if ic=8, oc=4, then k=8 and we use 8x8x4
+ const unsigned k =
+ 256 / (input_channels * output_channels); // 8 inputs per vector output
+
+ aie::vector zero32 = aie::zeros();
+
+ // aie::vector prev_a[3],
+ // aie::vector in_a;
+ // aie::vector tmp_a;
+ // aie::vector tmp_a1, tmp_a2;
+ // aie::vector in_b;
+
+ uint8_t *restrict line[3];
+ // uint8_t *line[3];
+ line[0] = line0;
+ line[1] = line1;
+ line[2] = line2;
+
+ int8_t *restrict wtsLine[3];
+ // int8_t *wtsLine[3];
+ // oc,ic,ky,kx,ic8,oc8
+ wtsLine[0] = wts + (channel_offset / 8) * (input_channels / 8) *
+ kernel_height * kernel_width * 64;
+ wtsLine[1] = wts +
+ (channel_offset / 8) * (input_channels / 8) * kernel_height *
+ kernel_width * 64 +
+ kernel_width * 64; // next kernel line is always 8*8 away
+ wtsLine[2] = wts +
+ (channel_offset / 8) * (input_channels / 8) * kernel_height *
+ kernel_width * 64 +
+ 2 * kernel_width * 64; // next kernel line is always 8*8 away
+
+ MMUL4x8x8 acc_tmp[8];
+
+ // Zero accumulators used for storing partial results
+ // for(int x=0; x();
+ }
+
+ // TODO temporary workaround. When assigned to input_width, the results are
+ // wrong. ???
+ const int iw = 32;
+ // const int32_t iw = input_width;
+
+ // const int iw_32 = ((input_width/4)-2)/8;
+ // const int iw_32 = ((iw/4)-2)/8;
+ // const int iw_32 = ((32/4)-2)/8;
+ const int iw_32 = 0;
+
+ // const int iw_32_rem = ((input_width/4)-2) % 8;
+ // const int iw_32_rem = ((iw/4)-2) % 8;
+ // const int iw_32_rem = ((32/4)-2) % 8;
+ const int iw_32_rem = 6;
+
+ // output += (channel_offset*iw); // channel_offset/8*iw*8
+
+ int kernel_height_start;
+ int kernel_height_end;
+
+ // int kernel_height_start, kernel_height_end;
+#ifdef BORDER_REPLICATE
+ kernel_height_start = 0;
+ kernel_height_end = kernel_height;
+ // constexpr int kernel_height_start = 0;
+ // constexpr int kernel_height_end = kernel_height;
+#else // Zero border for 3x3
+ // constexpr int kernel_height_start = 0;
+ // constexpr int kernel_height_end = kernel_height-1;
+
+ // if(check == top)
+ // idx_adj = 1;
+
+ // We skip top or bottom row for zero border
+ switch (check) {
+ case top:
+ kernel_height_start = 1;
+ kernel_height_end = kernel_height;
+ break;
+ case middle:
+ kernel_height_start = 0;
+ kernel_height_end = kernel_height;
+ break;
+ case bottom:
+ kernel_height_start = 0;
+ kernel_height_end = kernel_height - 1;
+ break;
+ }
+#endif
+
+ // --------------------------------------------------------------------
+ // Leftmost pattern
+ // --------------------------------------------------------------------
+ // Computes leftmost 4 inputs for all input/output channels.
+ // This shifts the leftmost input data by 1 (x8 channels) for 3x3 to
+ // account for border. Border replicate copies the leftmost input while
+ // 0 border shifts in 0's. If we need to support larger than 3x3, the
+ // replicate logic would need to be changed.
+ // --------------------------------------------------------------------
+ {
+ // in_b = aie::load_v<64>(wtsLine[kernel_height_start]);
+ // wtsLine[kernel_height_start] +=64; // wts ic0..7(oc0..7)
+
+ MMUL4x8x8 acc1 = aie::zeros();
+
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int ic = 0; ic < (input_channels / 8); ic++)
+ chess_loop_range(2, ) {
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, )
+ // chess_unroll_loop()
+ {
+ // Load input data [a0 a1 a2 a3 a4 a5 a6 a7] where each position
+ // has data for 8 channels
+ auto tmp_a1 = aie::load_v<32>(line[i]);
+ line[i] += 32; // act 0..3 (ic0..7 for each)
+ auto tmp_a2 =
+ aie::load_v<32>(line[i]); // act 4..7 (ic0..7 for each)
+ auto in_a = aie::concat(tmp_a1, tmp_a2);
+
+ aie::vector tmp_a;
+#ifdef BORDER_REPLICATE
+ tmp_a1 = aie::shuffle_up(tmp_a1, 24);
+ tmp_a.insert<32>(1, tmp_a1);
+#else
+ tmp_a = aie::zeros();
+#endif
+ // Shift right 1 input (8 channels) [- a0 a1 a2 a3 a4 a5 a6] where
+ // - is either a0 or 0's
+ in_a = aie::shuffle_up_fill(in_a, tmp_a, 8);
+
+ // Previous buffer stores shifted data, [- - - - a0 a1 a2 a3]
+ // where - is
+ // prev_a[i] = aie::shuffle_up(in_a, 24); // Shift right (4-1)*8
+
+ // prev_a[i] = in_a;
+ // prev_a[i] = aie::shuffle_up(prev_a[i], 24); // Shift right
+ // (4-1)*8
+
+ // For kernel width, we load 64 weights (8 ics x 8 ocs) and
+ // multiply it with the act buffer. acc[32] += in_a[32] * wts[64]
+ // We then shift the buffer left by 1 data position (8 channels).
+ for (int j = 0; j < kernel_width; j++)
+ chess_loop_range(3, 3) // TODO Assume 3x3
+ chess_unroll_loop() {
+ auto in_b = aie::load_v<64>(wtsLine[i]);
+ wtsLine[i] += 64; // wts ic0..7(oc0..7)
+ acc1.mac(in_a.extract<32>(0), in_b);
+ // Shift input A by 1 row (1x8) which is by 1 (the 8 is the
+ // ic=8)
+ in_a = aie::shuffle_down(in_a, 8);
+ }
+ wtsLine[i] -=
+ (kernel_width * 64); // Reset weight pointer for this line
+ // wtsLine[i] += ((kernel_height-1)*kernel_width*64); // Move to
+ // next ic/8 position No need to load next set of weights because
+ // next row of weights immediately follows line[i] += (iw*4)*8; //
+ // Increment to next ic/8 position (reset at end of outermost
+ // loop)
+ } // for(int i=kernel_height_start; i o1 = acc1.to_vector(scale);
+ aie::store_v(output, o1);
+ output += iw * 8; // Shift to next oc/8 offset for left side
+
+ acc1 = aie::zeros();
+
+ // Shift back to beginning of input
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_loop_range(2, ) { line[i] -= (input_channels / 8) * (iw * 8); }
+
+ } // for(int oc=0; oc<(output_channels/8); oc++) {
+
+ // Reset output to beginning, then add 4*8
+ // Reset wts to beginning of wts
+ // Reset line to beginning of input, then add 4*8
+ output -= (output_channels / 8) * (iw * 8) - 32;
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_loop_range(2, ) {
+ wtsLine[i] -= (output_channels / 8) * (input_channels / 8) *
+ kernel_width * kernel_height *
+ 64; // kernel_width*kernel_height*8*8
+ // line[i] -= (output_channels/8)*(input_channels/8)*(iw*8)-32; //
+ line[i] += 32;
+ }
+ }
+
+ // --------------------------------------------------------------------
+ // Middle pattern
+ // --------------------------------------------------------------------
+ // The middle section algorithm is different because we want to minimize
+ // the reloading of weights and activations. So instead, we use up to 8
+ // accumulators to store partial products with activations being shifted.
+ // Then for the next kernel position, we reload weights.
+ //
+ // H,W,C8
+ // --------------------------------------------------------------------
+
+ // Main loop for when input_width/4-2 > 8
+ if (iw_32 > 0) {
+
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int iw_32c = 0; iw_32c < iw_32; iw_32c++) {
+ for (int ic = 0; ic < (input_channels / 8); ic++)
+ chess_loop_range(2, ) {
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) { // 1 to 3
+
+ for (int j = 0; j < kernel_width; j++)
+ chess_loop_range(3, 3) // TODO Assume 3x3
+ chess_unroll_loop() {
+ aie::vector wtsVec = aie::load_v<64>(wtsLine[i]);
+ wtsLine[i] += 64;
+
+ // auto prev = prev_a[i].extract<32>(1);
+ // prev
+ // = x0..x3(ci0..ci7)
+ auto prev = aie::load_v<32>((line[i] - 32));
+ auto curr = aie::load_v<32>((line[i]));
+ line[i] += 32;
+ auto next = aie::load_v<32>((line[i]));
+ // line[i] += 32;
+
+ auto tprev = aie::concat(zero32, prev);
+ auto tmp1 = aie::concat(curr, next);
+
+ tmp1 = aie::shuffle_up_fill(
+ tmp1, tprev, 8); // curr = x3..x6(ci0..ci7)
+
+ tmp1 = aie::shuffle_down(
+ tmp1, j * 8); // curr = x4..x7(ci0..ci7) to
+
+ // j = 0, 1, 2
+ int j1 = j + 1; // 1, 2, 3
+ int j2 = j + 3 - (j >> 1) * 4; // 3, 4, 1
+ int lineIncr = (j >> 1) * 32; // 0, 0, 32
+
+ for (int x = 0; x < 8; x++)
+ chess_unroll_loop() chess_loop_range(8, 8) {
+ // auto tmp1 = aie::concat(curr, next);
+ // auto tprev = aie::concat(zero32, prev);
+ // auto tmp2 = aie::shuffle_up_fill(
+ // tmp1, tprev, 8); // curr = x3..x6(ci0..ci7)
+ // auto tmp3 = aie::shuffle_down(
+ // tmp2, j * 8); // curr = x4..x7(ci0..ci7) to
+ // // x5..x8(ci0..ci7)ss
+
+ // prev = curr;
+ // curr = next;
+ // next = aie::load_v<32>(line[i]);
+
+ // line[i] += 32; // next_prev = x4..x7(ci0..ci7)
+
+ // acc_tmp[x].mac(tmp3.extract<32>(0), wtsVec);
+
+ acc_tmp[x].mac(tmp1.extract<32>(0), wtsVec);
+
+ tmp1 = aie::shuffle_down(tmp1, j1 * 8);
+ tmp1.insert(1, aie::load_v<32>(line[i] + lineIncr));
+ line[i] += 32;
+ tmp1 = aie::shuffle_down(tmp1, j2 * 8);
+
+ } // for(int x=0; x<8; x++)
+ line[i] -= 320; // (8+2)*32, Reset line buffer ptr to
+ // beginning of line (after first 4)
+ } // for(int j=0; j o1 = acc_tmp[x].to_vector(scale);
+ aie::store_v(output, o1);
+ output += 32;
+ acc_tmp[x] = aie::zeros();
+ }
+ // For next 8 activations, reset line buffer and weights
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) {
+ line[i] -=
+ (input_channels / 8) * (iw * 8); // length of act to shift back
+ }
+ } // for(int iw_32c=0; iw_32c 0)
+
+ // Secondary loop for input_width remainder (iw_32_rem < 8)
+ if (iw_32_rem > 0) {
+
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int ic = 0; ic < (input_channels / 8); ic++)
+ chess_loop_range(2, ) {
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) { // 1 to 3
+ for (int j = 0; j < kernel_width; j++)
+ chess_loop_range(3, 3) // TODO Assume 3x3
+ chess_unroll_loop() {
+ // New weight every kernel_width
+ aie::vector wtsVec = aie::load_v<64>(wtsLine[i]);
+ wtsLine[i] += 64;
+ // auto prev = prev_a[i].extract<32>(1); //
+ // prev = x0..x3(ci0..ci7)
+ auto prev = aie::load_v<32>((line[i] - 32));
+ auto curr = aie::load_v<32>((line[i]));
+ line[i] += 32;
+ auto next = aie::load_v<32>((line[i]));
+ // line[i] += 32;
+
+ auto tprev = aie::concat(zero32, prev);
+ auto tmp1 = aie::concat(curr, next);
+
+ // j = 0, 1, 2
+ int jr0 = (2 - j) >> 1; // 1, 0, 0
+ int j0 = (j >> 1); // 0, 0, 1
+ int j1 = j + 1; // 1, 2, 3
+ int j2 = j + 3 - ((j >> 1) * 4); // 3, 4, 1
+ int lineIncr = (j >> 1) * 32; // 0, 0, 32
+
+ tmp1 = aie::shuffle_up_fill(
+ tmp1, tprev, jr0 * 8); // curr = x3..x6(ci0..ci7)
+
+ tmp1 = aie::shuffle_down(
+ tmp1, j0 * 8); // curr = x4..x7(ci0..ci7) to
+
+ for (int x = 0; x < iw_32_rem; x++) // remainder input width <
+ // 8 chess_unroll_loop()
+ chess_unroll_loop() {
+ // auto tmp1 = aie::concat(curr, next);
+ // auto tprev = aie::concat(zero32, prev);
+ // auto tmp2 = aie::shuffle_up_fill(
+ // tmp1, tprev, 8); // curr = x3..x6(ci0..ci7)
+ // auto tmp3 = aie::shuffle_down(
+ // tmp2,
+ // j * 8); // curr = x3..x6(ci0..ci7) to
+ // x5..x8(ci0..ci7)ss
+
+ // prev = curr;
+ // curr = next;
+ // next = aie::load_v<32>(line[i]);
+ // line[i] += 32; // next_prev = x4..x7(ci0..ci7)
+
+ // acc_tmp[x].mac(tmp3.extract<32>(0), wtsVec);
+ acc_tmp[x].mac(tmp1.extract<32>(0), wtsVec);
+
+ tmp1 = aie::shuffle_down(tmp1, j1 * 8);
+ tmp1.insert(1, aie::load_v<32>(line[i] + lineIncr));
+ line[i] += 32;
+ tmp1 = aie::shuffle_down(tmp1, j2 * 8);
+ }
+ line[i] -= (iw_32_rem + 1) *
+ 32; // Reset line buffer ptr to beginning of
+ // (iw_32_rem + 2) * 32; // Reset line buffer ptr to beginning
+ // of line (after first 4)
+ } // for(int j=0; j o1 = acc_tmp[x].to_vector(scale);
+ aie::store_v(output, o1);
+ output += 32;
+ acc_tmp[x] = aie::zeros(); // Reset accumulators
+ }
+ // Reset line ptr to beginning of input
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) {
+ line[i] -= (input_channels / 8) * (iw * 8);
+ }
+ // Output ptr should be in the right place (next oc/8)
+ output += (iw * 8) - (iw_32_rem * 32); // 32 = 4*8, shift to next oc/8
+ } // for(int oc=0; oc<(output_channels/8); oc++)
+ // Reset weights and line buffers for right side
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) {
+ wtsLine[i] -= (output_channels / 8) * (input_channels / 8) *
+ kernel_width * kernel_height *
+ 64; // kernel_width*kernel_height*8*8
+ line[i] +=
+ iw_32_rem * 32; // shift to beginnign of right data, iw_32_rem*4*8
+ }
+ // shift back so we're aligned with beginning of first oc/8 (rightmost 4
+ // data)
+ output -= (output_channels / 8) * (iw * 8) - (iw_32_rem * 32);
+
+ } // if (iw_32_rem > 0) {
+
+ // --------------------------------------------------------------------
+ // Right patterns
+ // --------------------------------------------------------------------
+ //
+ // --------------------------------------------------------------------
+ {
+ MMUL4x8x8 acc1 = aie::zeros();
+ for (int oc = 0; oc < (output_channels / 8); oc++) {
+ for (int ic = 0; ic < (input_channels / 8); ic++)
+ chess_loop_range(2, ) {
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, )
+ // chess_unroll_loop()
+ {
+ // Load next set of data for input A (matrix row), need stride
+ // info or line1/2/3 pointer
+ // TODO, did not store previous so need to load it again
+ // in_a = aie::load_v<64>(line[i]-32);
+ auto tmp_a1 =
+ aie::load_v<32>(line[i] - 32); // act 24..27 (ic0..7 for each)
+ auto tmp_a2 =
+ aie::load_v<32>(line[i]); // act 28..31 (ic0..7 for each)
+ auto in_a = aie::concat(tmp_a1, tmp_a2);
+
+ aie::vector tmp_a;
+#ifdef BORDER_REPLICATE
+ tmp_a2 = aie::shuffle_down(tmp_a2, 24);
+ tmp_a.insert<32>(0, tmp_a2);
+#else
+ tmp_a = aie::zeros();
+#endif
+ // shift by 32-8 (fill 32 then shift up by 8)
+ in_a =
+ aie::shuffle_down_fill(in_a, tmp_a, 24); // act 27..31 - - -
+
+ for (int j = 0; j < kernel_width; j++)
+ chess_loop_range(3, 3) chess_unroll_loop() {
+ auto in_b = aie::load_v<64>(wtsLine[i]);
+ wtsLine[i] += 64; // wts ic0..7(oc0..7)
+ acc1.mac(in_a.extract<32>(0), in_b);
+ // Shift input A by 1 row (1x8) which is by 1 (the 8 is the
+ // ic=8)
+ in_a = aie::shuffle_down(in_a, 8);
+ }
+ wtsLine[i] += ((kernel_height - 1) * kernel_width *
+ 64); // Move to next ic/8 position
+ // No need to load next set of weights because next row of weights
+ // immediately follows
+ line[i] += (iw * 8); // Increment to next ic/8 position (reset at
+ // end of outermost loop)
+ } // for(int i=kernel_height_start; i o1 = acc1.to_vector(scale);
+ aie::store_v(output, o1);
+ output += iw * 8; // Shift to next oc/8
+
+ acc1 = aie::zeros();
+
+ for (int i = kernel_height_start; i < kernel_height_end; i++)
+ chess_prepare_for_pipelining chess_loop_range(2, ) {
+ line[i] -= (input_channels / 8) *
+ (iw * 8); // shift back to beginning of this section
+ }
+ } // for(int oc=0; oc<(output_channels/8); oc++) {
+ }
+ event1();
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+
+extern "C" {
+
+#ifdef SCALAR
+
+#ifdef INT8_ACT
+
+void conv2dk3_i8(int8_t *line0, int8_t *line1, int8_t *line2, int8_t *wts,
+ uint8_t *output, const int32_t input_width,
+ const int32_t input_channels, const int32_t output_channels,
+ const int32_t kernel_width, const int32_t kernel_height,
+ const int32_t check, const int scale,
+ const int channel_offset) {
+ conv2dk3_i8_scalar(line0, line1, line2, wts, output, input_width,
+ input_channels, output_channels, kernel_width,
+ kernel_height, check, scale, channel_offset);
+}
+
+#else // UINT8_ACT
+
+void conv2dk3_ui8(uint8_t *line0, uint8_t *line1, uint8_t *line2, int8_t *wts,
+ uint8_t *output, const int32_t input_width,
+ const int32_t input_channels, const int32_t output_channels,
+ const int32_t kernel_width, const int32_t kernel_height,
+ const int32_t check, const int scale,
+ const int channel_offset) {
+ conv2dk3_ui8_scalar(line0, line1, line2, wts, output, input_width,
+ input_channels, output_channels, kernel_width,
+ kernel_height, check, scale, channel_offset);
+}
+
+#endif // UINT8_ACT
+
+#else // Vector
+
+#ifdef INT8_ACT
+
+void conv2dk3_i8(int8_t *line0, int8_t *line1, int8_t *line2, int8_t *wts,
+ uint8_t *output, const int32_t input_width,
+ const int32_t input_channels, const int32_t output_channels,
+ const int32_t kernel_width, const int32_t kernel_height,
+ const int32_t check, const int scale,
+ const int channel_offset) {
+ conv2dk3_i8_vector(line0, line1, line2, wts, output, input_width,
+ input_channels, output_channels, kernel_width,
+ kernel_height, check, scale, channel_offset);
+}
+
+#else // UINT8_ACT
+
+void conv2dk3_ui8(uint8_t *line0, uint8_t *line1, uint8_t *line2, int8_t *wts,
+ uint8_t *output, const int32_t input_width,
+ const int32_t input_channels, const int32_t output_channels,
+ const int32_t kernel_width, const int32_t kernel_height,
+ const int32_t check, const int scale,
+ const int channel_offset) {
+ conv2dk3_ui8_vector(line0, line1, line2, wts, output, input_width,
+ input_channels, output_channels, kernel_width,
+ kernel_height, check, scale, channel_offset);
+}
+
+#endif // UINT8_ACT
+
+#endif // Vector
+}
\ No newline at end of file
diff --git a/aie_kernels/aie2/conv2dk3.h b/aie_kernels/aie2/conv2dk3.h
new file mode 100755
index 0000000000..61a2f8e698
--- /dev/null
+++ b/aie_kernels/aie2/conv2dk3.h
@@ -0,0 +1,33 @@
+//===- conv2dk3.h -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CONV2DK3_H
+#define _CONV2DK3_H
+
+extern "C" {
+
+void conv2dk3_i8(int8_t *line0, int8_t *line1, int8_t *line2, int8_t *wts,
+ uint8_t *output, const int32_t input_width,
+ const int32_t input_channels, const int32_t output_channels,
+ const int32_t kernel_width, const int32_t kernel_height,
+ const int32_t check, const int scale,
+ const int channel_offset);
+
+void conv2dk3_ui8(uint8_t *line0, uint8_t *line1, uint8_t *line2, int8_t *wts,
+ uint8_t *output, const int32_t input_width,
+ const int32_t input_channels, const int32_t output_channels,
+ const int32_t kernel_width, const int32_t kernel_height,
+ const int32_t check, const int scale,
+ const int channel_offset);
+
+} // extern "C"
+
+#endif
diff --git a/programming_examples/ml/bottleneck/CMakeLists.txt b/programming_examples/ml/bottleneck/CMakeLists.txt
new file mode 100644
index 0000000000..4b897cb29c
--- /dev/null
+++ b/programming_examples/ml/bottleneck/CMakeLists.txt
@@ -0,0 +1,89 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DOpenCV_DIR: Path to OpenCV install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+ set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+ set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
+ set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+ set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+ set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
+ set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif ()
+
+set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
+set(EDGEDETECT_HEIGHT 1080 CACHE STRING "image height")
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+find_package(OpenCV REQUIRED)
+message("opencv library paht: ${OpenCV_LIB_PATH}")
+message("opencv libs: ${OpenCV_LIBS}")
+
+
+add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+ test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC
+ EDGEDETECT_WIDTH=${EDGEDETECT_WIDTH}
+ EDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT}
+ DISABLE_ABI_CHECK=1
+ )
+
+target_include_directories (${currentTarget} PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils
+ ${XRT_INC_DIR}
+ ${OpenCV_INCLUDE_DIRS}
+ ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+ ${XRT_LIB_DIR}
+ ${OpenCV_LIB_PATH}
+ ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ ${OpenCV_LIBS}
+ boost_program_options
+ boost_filesystem
+ )
+else()
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ ${OpenCV_LIBS}
+ )
+endif()
diff --git a/programming_examples/ml/bottleneck/Makefile b/programming_examples/ml/bottleneck/Makefile
new file mode 100755
index 0000000000..f5c6e4561f
--- /dev/null
+++ b/programming_examples/ml/bottleneck/Makefile
@@ -0,0 +1,40 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+include ../../makefile-common
+
+mlirFileName = aie
+
+all: build/conv2dk1.o build/conv2dk3.o build/conv2dk1_skip.o build/final.xclbin
+
+build/${mlirFileName}.mlir: aie2.py
+ mkdir -p ${@D}
+ python3 $< > $@
+
+insts.txt: build/${mlirFileName}.mlir
+ aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $<
+
+build/conv2dk1.o: ../../../aie_kernels/aie2/conv2dk1.cc
+ xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
+
+build/conv2dk3.o: ../../../aie_kernels/aie2/conv2dk3.cc
+ xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@
+
+build/conv2dk1_skip.o: ../../../aie_kernels/aie2/conv2dk1_skip.cc
+ xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
+
+build/final.xclbin: build/${mlirFileName}.mlir
+ cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+ --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+
+clean:
+ rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log .xclbin sim \
+ chess* *.o insts.txt \
+ *.log aie_partition.json *.bin BOOT.BIN _x test.exe
+
+run_py:
+ ${powershell} python3 test.py
diff --git a/programming_examples/ml/bottleneck/README.md b/programming_examples/ml/bottleneck/README.md
new file mode 100644
index 0000000000..144b8e36f2
--- /dev/null
+++ b/programming_examples/ml/bottleneck/README.md
@@ -0,0 +1,125 @@
+
+
+# The Bottleneck Block
+## Introduction
+The bottleneck block is a key component in deep neural network architectures, such as ResNet. It is designed to help address the challenge of training very deep networks by reducing the computational cost while maintaining or improving performance. This README provides an overview of the process and considerations for accelerating a single bottleneck block.
+
+
+## Bottleneck Block Overview
+The following are the components and functionality of a standard bottleneck block:
+
+* Identity Mapping: The core idea behind bottleneck blocks is the concept of identity mapping. Traditional neural network layers aim to learn a mapping from input to output. In contrast, a bottleneck block learns a residual mapping, which is the difference between the input and the output. The original input is then added back to this residual mapping to obtain the final output. Mathematically, this can be represented as `output = input + residual`.
+
+* Convolutional Layers: Bottleneck blocks typically consist of one or more convolutional layers. These layers are responsible for learning features from the input data. Convolutional layers apply filters/kernels to the input feature maps to extract relevant patterns and features. The number of filters, kernel size, and other parameters can vary based on the specific architecture and requirements.
+
+* Activation Function: After each convolutional layer, an activation function is applied to introduce non-linearity into the network. Rectified Linear Unit (ReLU) is commonly used as the activation function due to its simplicity and effectiveness.
+
+* Batch Normalization: Batch normalization is often employed after convolutional layers to stabilize and accelerate the training process. It normalizes the activations of each layer, making optimization more robust and efficient.
+
+* Skip Connection (Identity Shortcut): This is the hallmark of bottleneck blocks. The skip connection directly passes the input from one layer to a later layer without any modification. It provides an alternative, shorter path for gradient flow during training. If the input and output dimensions of the bottleneck block are the same, the skip connection directly adds the input to the output. If the dimensions differ, the skip connection might include a 1x1 convolutional layer to adjust the dimensions accordingly.
+
+* Final Output: The final output of the bottleneck block is obtained by adding the input to the output of the convolutional layers (including any adjustments made to match dimensions via the skip connection).
+
+