From 9716dc78b8a3fe04a018b2739b261732f2e27264 Mon Sep 17 00:00:00 2001
From: James Lin <linjames@xilinx.com>
Date: Mon, 10 Jun 2024 18:44:17 -0500
Subject: [PATCH] [aievec] add aievec.min/max e2e to-llvm tests (#1528)

---
 lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp  | 10 ++++---
 test/Conversion/AIEVecToLLVM/test-max.mlir    |  6 ++--
 test/Conversion/AIEVecToLLVM/test-min.mlir    |  6 ++--
 .../i16xi16_max_elem-llvm.mlir                | 28 +++++++++++++++++++
 .../i16xi16_max_elem/i16xi16_max_elem.mlir    | 10 +++++--
 .../i16xi16_max_elem/testbench.cc             | 16 +++++++++++
 .../i16xi16_min_elem-llvm.mlir                | 28 +++++++++++++++++++
 .../i16xi16_min_elem/i16xi16_min_elem.mlir    | 10 +++++--
 .../i16xi16_min_elem/testbench.cc             | 14 ++++++++++
 .../i32xi32_max_elem-llvm.mlir                | 28 +++++++++++++++++++
 .../i32xi32_max_elem/i32xi32_max_elem.mlir    | 10 +++++--
 .../i32xi32_max_elem/testbench.cc             | 16 +++++++++++
 .../i32xi32_min_elem-llvm.mlir                | 28 +++++++++++++++++++
 .../i32xi32_min_elem/i32xi32_min_elem.mlir    | 10 +++++--
 .../i32xi32_min_elem/testbench.cc             | 16 +++++++++++
 .../i8xi8_max_elem/i8xi8_max_elem-llvm.mlir   | 28 +++++++++++++++++++
 .../i8xi8_max_elem/i8xi8_max_elem.mlir        | 23 ++++++++-------
 .../aievec_tests/i8xi8_max_elem/testbench.cc  | 16 +++++++++++
 .../i8xi8_min_elem/i8xi8_min_elem-llvm.mlir   | 28 +++++++++++++++++++
 .../i8xi8_min_elem/i8xi8_min_elem.mlir        | 23 ++++++++-------
 .../aievec_tests/i8xi8_min_elem/testbench.cc  | 16 +++++++++++
 21 files changed, 328 insertions(+), 42 deletions(-)
 create mode 100644 test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem-llvm.mlir
 create mode 100644 test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem-llvm.mlir
 create mode 100644 test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem-llvm.mlir
 create mode 100644 test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem-llvm.mlir
 create mode 100644 test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem-llvm.mlir
 create mode 100644 test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem-llvm.mlir
diff --git a/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp b/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp
index 059c2101a4..6a0cb062da 100644
--- a/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp
+++ b/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp
@@ -1584,9 +1584,10 @@ class MaxOpConversion : public mlir::ConvertOpToLLVMPattern<aievec::MaxOp> {
     // create xllvm intrinsic
     Value maxOp = nullptr;
     if (llvm::isa<IntegerType>(resultScaTy)) {
-      // create constant for cmp
+      // Create the constant for the intrinsic's third operand, `cmp`.
+      // Note: the vmax intrinsic implicitly interprets `cmp` as `sign`.
       auto cmpCst = rewriter.create<LLVM::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(1));
       SmallVector<Value> operands{adaptor.getLhs(), adaptor.getRhs(), cmpCst};
       if (resultBitWidth == 8) {
         maxOp = rewriter.create<xllvm::VectorMaxLt8IntrOp>(
@@ -1681,9 +1682,10 @@ class MinOpConversion : public mlir::ConvertOpToLLVMPattern<aievec::MinOp> {
     // create xllvm intrinsic
     Value minOp = nullptr;
     if (llvm::isa<IntegerType>(resultScaTy)) {
-      // create constant for cmp
+      // Create the constant for the intrinsic's third operand, `cmp`.
+      // Note: the vmin intrinsic implicitly interprets `cmp` as `sign`.
       auto cmpCst = rewriter.create<LLVM::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(1));
       SmallVector<Value> operands{adaptor.getLhs(), adaptor.getRhs(), cmpCst};
       if (resultBitWidth == 8) {
         minOp = rewriter.create<xllvm::VectorMinGe8IntrOp>(
diff --git a/test/Conversion/AIEVecToLLVM/test-max.mlir b/test/Conversion/AIEVecToLLVM/test-max.mlir
index 59d4f49bcd..033ec75e71 100644
--- a/test/Conversion/AIEVecToLLVM/test-max.mlir
+++ b/test/Conversion/AIEVecToLLVM/test-max.mlir
@@ -7,7 +7,7 @@ func.func @i8_max(%arg0 : vector<64xi8>) -> vector<64xi8> {
 
 // CHECK-LABEL: @i8_max
 // CHECK-SAME: %[[ARG0:.*]]: vector<64xi8>
-// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[VMAX:.*]] = "xllvm.intr.aie2.vmax.lt8"(
 // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : 
 // CHECK-SAME: (vector<64xi8>, vector<64xi8>, i32) -> !llvm.struct<(vector<64xi8>, vector<2xi32>)>
@@ -23,7 +23,7 @@ func.func @i16_max(%arg0 : vector<32xi16>) -> vector<32xi16> {
 
 // CHECK-LABEL: @i16_max
 // CHECK-SAME: %[[ARG0:.*]]: vector<32xi16>
-// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[VMAX:.*]] = "xllvm.intr.aie2.vmax.lt16"(
 // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : 
 // CHECK-SAME: (vector<32xi16>, vector<32xi16>, i32) -> !llvm.struct<(vector<32xi16>, i32)>
@@ -39,7 +39,7 @@ func.func @i32_max(%arg0 : vector<16xi32>) -> vector<16xi32> {
 
 // CHECK-LABEL: @i32_max
 // CHECK-SAME: %[[ARG0:.*]]: vector<16xi32>
-// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[VMAX:.*]] = "xllvm.intr.aie2.vmax.lt32"(
 // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : 
 // CHECK-SAME: (vector<16xi32>, vector<16xi32>, i32) -> !llvm.struct<(vector<16xi32>, i32)>
diff --git a/test/Conversion/AIEVecToLLVM/test-min.mlir b/test/Conversion/AIEVecToLLVM/test-min.mlir
index 4930d639c9..595d759438 100644
--- a/test/Conversion/AIEVecToLLVM/test-min.mlir
+++ b/test/Conversion/AIEVecToLLVM/test-min.mlir
@@ -7,7 +7,7 @@ func.func @i8_min(%arg0 : vector<64xi8>) -> vector<64xi8> {
 
 // CHECK-LABEL: @i8_min
 // CHECK-SAME: %[[ARG0:.*]]: vector<64xi8>
-// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[VMIN:.*]] = "xllvm.intr.aie2.vmin.ge8"(
 // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : 
 // CHECK-SAME: (vector<64xi8>, vector<64xi8>, i32) -> !llvm.struct<(vector<64xi8>, vector<2xi32>)>
@@ -23,7 +23,7 @@ func.func @i16_min(%arg0 : vector<32xi16>) -> vector<32xi16> {
 
 // CHECK-LABEL: @i16_min
 // CHECK-SAME: %[[ARG0:.*]]: vector<32xi16>
-// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[VMIN:.*]] = "xllvm.intr.aie2.vmin.ge16"(
 // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : 
 // CHECK-SAME: (vector<32xi16>, vector<32xi16>, i32) -> !llvm.struct<(vector<32xi16>, i32)>
@@ -39,7 +39,7 @@ func.func @i32_min(%arg0 : vector<16xi32>) -> vector<16xi32> {
 
 // CHECK-LABEL: @i32_min
 // CHECK-SAME: %[[ARG0:.*]]: vector<16xi32>
-// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[VMIN:.*]] = "xllvm.intr.aie2.vmin.ge32"(
 // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : 
 // CHECK-SAME: (vector<16xi32>, vector<16xi32>, i32) -> !llvm.struct<(vector<16xi32>, i32)>
diff --git a/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem-llvm.mlir b/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem-llvm.mlir
new file mode 100644
index 0000000000..c1d3d88390
--- /dev/null
+++ b/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem-llvm.mlir
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// REQUIRES: peano
+// RUN: mkdir -p %t/data; cd %t
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" %vector-to-llvmir% -o llvmir.mlir
+// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll
+// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Scalar kernel under test: %arg2[i] = signed max(%arg0[i], %arg1[i]) for 1024 i16 elements.
+  func.func @dut(%arg0: memref<1024xi16>, %arg1: memref<1024xi16>, %arg2: memref<1024xi16>) {
+    memref.assume_alignment %arg0, 32 : memref<1024xi16> // asserts %arg0 (first input) is 32-byte aligned
+    memref.assume_alignment %arg1, 32 : memref<1024xi16> // asserts %arg1 (second input) is 32-byte aligned
+    memref.assume_alignment %arg2, 32 : memref<1024xi16> // asserts %arg2 (output) is 32-byte aligned
+    affine.for %arg3 = 0 to 1024 { // scalar loop over all elements; the aie-opt command above runs affine-super-vectorize (virtual-vector-size=32) on it
+      %0 = affine.load %arg0[%arg3] : memref<1024xi16>
+      %1 = affine.load %arg1[%arg3] : memref<1024xi16>
+      %2 = arith.maxsi %0, %1 : i16 // signed integer maximum
+      affine.store %2, %arg2[%arg3] : memref<1024xi16>
+    }
+    return
+  }
+}
diff --git a/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem.mlir b/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem.mlir
index 1ae4491387..339ab0b0cc 100644
--- a/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem.mlir
+++ b/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem.mlir
@@ -1,7 +1,11 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mkdir -p %t/data; cd %t
 // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o
-// RUN: mkdir -p data
+// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o
 // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
diff --git a/test/unit_tests/aievec_tests/i16xi16_max_elem/testbench.cc b/test/unit_tests/aievec_tests/i16xi16_max_elem/testbench.cc
index f8fe0a969f..60a6264401 100644
--- a/test/unit_tests/aievec_tests/i16xi16_max_elem/testbench.cc
+++ b/test/unit_tests/aievec_tests/i16xi16_max_elem/testbench.cc
@@ -4,7 +4,19 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+
+#ifdef TO_CPP
 void dut(int16_t *restrict in0, int16_t *restrict in1, int16_t *restrict out0);
+#elif TO_LLVM
+extern "C" {
+void dut(int16_t *in0_allocated, int16_t *in0_aligned, int64_t in0_offset,
+         int64_t in0_sizes_0, int64_t in0_strides_0, int16_t *in1_allocated,
+         int16_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0,
+         int64_t in1_strides_0, int16_t *out0_allocated, int16_t *out0_aligned,
+         int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0);
+}
+#endif
+
 void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0);
 
 alignas(32) int16_t g_in0[IN0_SIZE];
@@ -26,7 +38,11 @@ int main(int argc, char *argv[]) {
 
   chess_memory_fence();
   auto cyclesBegin = chess_cycle_count();
+#ifdef TO_CPP
   dut(g_in0, g_in1, g_out0);
+#elif TO_LLVM
+  dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0);
+#endif
   auto cyclesEnd = chess_cycle_count();
   chess_memory_fence();
 
diff --git a/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem-llvm.mlir b/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem-llvm.mlir
new file mode 100644
index 0000000000..dfdf3d3db0
--- /dev/null
+++ b/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem-llvm.mlir
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// REQUIRES: peano
+// RUN: mkdir -p %t/data; cd %t
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" %vector-to-llvmir% -o llvmir.mlir
+// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll
+// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Scalar kernel under test: %arg2[i] = signed min(%arg0[i], %arg1[i]) for 1024 i16 elements.
+  func.func @dut(%arg0: memref<1024xi16>, %arg1: memref<1024xi16>, %arg2: memref<1024xi16>) {
+    memref.assume_alignment %arg0, 32 : memref<1024xi16> // asserts %arg0 (first input) is 32-byte aligned
+    memref.assume_alignment %arg1, 32 : memref<1024xi16> // asserts %arg1 (second input) is 32-byte aligned
+    memref.assume_alignment %arg2, 32 : memref<1024xi16> // asserts %arg2 (output) is 32-byte aligned
+    affine.for %arg3 = 0 to 1024 { // scalar loop over all elements; the aie-opt command above runs affine-super-vectorize (virtual-vector-size=32) on it
+      %0 = affine.load %arg0[%arg3] : memref<1024xi16>
+      %1 = affine.load %arg1[%arg3] : memref<1024xi16>
+      %2 = arith.minsi %0, %1 : i16 // signed integer minimum
+      affine.store %2, %arg2[%arg3] : memref<1024xi16>
+    }
+    return
+  }
+}
diff --git a/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem.mlir b/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem.mlir
index c73cd2137d..377a4b42b5 100644
--- a/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem.mlir
+++ b/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem.mlir
@@ -1,7 +1,11 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mkdir -p %t/data; cd %t
 // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o
-// RUN: mkdir -p data
+// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o
 // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
diff --git a/test/unit_tests/aievec_tests/i16xi16_min_elem/testbench.cc b/test/unit_tests/aievec_tests/i16xi16_min_elem/testbench.cc
index 9d7a4bd39d..35240fbda2 100644
--- a/test/unit_tests/aievec_tests/i16xi16_min_elem/testbench.cc
+++ b/test/unit_tests/aievec_tests/i16xi16_min_elem/testbench.cc
@@ -4,7 +4,17 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+#ifdef TO_CPP
 void dut(int16_t *restrict in0, int16_t *restrict in1, int16_t *restrict out0);
+#elif TO_LLVM
+extern "C" {
+void dut(int16_t *in0_allocated, int16_t *in0_aligned, int64_t in0_offset,
+         int64_t in0_sizes_0, int64_t in0_strides_0, int16_t *in1_allocated,
+         int16_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0,
+         int64_t in1_strides_0, int16_t *out0_allocated, int16_t *out0_aligned,
+         int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0);
+}
+#endif
 void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0);
 
 alignas(32) int16_t g_in0[IN0_SIZE];
@@ -26,7 +36,11 @@ int main(int argc, char *argv[]) {
 
   chess_memory_fence();
   auto cyclesBegin = chess_cycle_count();
+#ifdef TO_CPP
   dut(g_in0, g_in1, g_out0);
+#elif TO_LLVM
+  dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0);
+#endif
   auto cyclesEnd = chess_cycle_count();
   chess_memory_fence();
 
diff --git a/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem-llvm.mlir b/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem-llvm.mlir
new file mode 100644
index 0000000000..0e3c3590b3
--- /dev/null
+++ b/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem-llvm.mlir
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// REQUIRES: peano
+// RUN: mkdir -p %t/data; cd %t
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir
+// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll
+// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Scalar kernel under test: %arg2[i] = signed max(%arg0[i], %arg1[i]) for 1024 i32 elements.
+  func.func @dut(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) {
+    memref.assume_alignment %arg0, 32 : memref<1024xi32> // asserts %arg0 (first input) is 32-byte aligned
+    memref.assume_alignment %arg1, 32 : memref<1024xi32> // asserts %arg1 (second input) is 32-byte aligned
+    memref.assume_alignment %arg2, 32 : memref<1024xi32> // asserts %arg2 (output) is 32-byte aligned
+    affine.for %arg3 = 0 to 1024 { // scalar loop over all elements; the aie-opt command above runs affine-super-vectorize (virtual-vector-size=16) on it
+      %0 = affine.load %arg0[%arg3] : memref<1024xi32>
+      %1 = affine.load %arg1[%arg3] : memref<1024xi32>
+      %2 = arith.maxsi %0, %1 : i32 // signed integer maximum
+      affine.store %2, %arg2[%arg3] : memref<1024xi32>
+    }
+    return
+  }
+}
diff --git a/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem.mlir b/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem.mlir
index dc5be1d5f3..7ffd6698d0 100644
--- a/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem.mlir
+++ b/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem.mlir
@@ -1,7 +1,11 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mkdir -p %t/data; cd %t
 // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o
-// RUN: mkdir -p data
+// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o
 // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
diff --git a/test/unit_tests/aievec_tests/i32xi32_max_elem/testbench.cc b/test/unit_tests/aievec_tests/i32xi32_max_elem/testbench.cc
index 5c0c91ff42..036f7ecb85 100644
--- a/test/unit_tests/aievec_tests/i32xi32_max_elem/testbench.cc
+++ b/test/unit_tests/aievec_tests/i32xi32_max_elem/testbench.cc
@@ -4,7 +4,19 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+
+#ifdef TO_CPP
 void dut(int32_t *restrict in0, int32_t *restrict in1, int32_t *restrict out0);
+#elif TO_LLVM
+extern "C" {
+void dut(int32_t *in0_allocated, int32_t *in0_aligned, int64_t in0_offset,
+         int64_t in0_sizes_0, int64_t in0_strides_0, int32_t *in1_allocated,
+         int32_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0,
+         int64_t in1_strides_0, int32_t *out0_allocated, int32_t *out0_aligned,
+         int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0);
+}
+#endif
+
 void dut_ref(int32_t *in0, int32_t *in1, int32_t *out0);
 
 alignas(32) int32_t g_in0[IN0_SIZE];
@@ -26,7 +38,11 @@ int main(int argc, char *argv[]) {
 
   chess_memory_fence();
   auto cyclesBegin = chess_cycle_count();
+#ifdef TO_CPP
   dut(g_in0, g_in1, g_out0);
+#elif TO_LLVM
+  dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0);
+#endif
   auto cyclesEnd = chess_cycle_count();
   chess_memory_fence();
 
diff --git a/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem-llvm.mlir b/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem-llvm.mlir
new file mode 100644
index 0000000000..96c20735d6
--- /dev/null
+++ b/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem-llvm.mlir
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// REQUIRES: peano
+// RUN: mkdir -p %t/data; cd %t
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir
+// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll
+// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Scalar kernel under test: %arg2[i] = signed min(%arg0[i], %arg1[i]) for 1024 i32 elements.
+  func.func @dut(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) {
+    memref.assume_alignment %arg0, 32 : memref<1024xi32> // asserts %arg0 (first input) is 32-byte aligned
+    memref.assume_alignment %arg1, 32 : memref<1024xi32> // asserts %arg1 (second input) is 32-byte aligned
+    memref.assume_alignment %arg2, 32 : memref<1024xi32> // asserts %arg2 (output) is 32-byte aligned
+    affine.for %arg3 = 0 to 1024 { // scalar loop over all elements; the aie-opt command above runs affine-super-vectorize (virtual-vector-size=16) on it
+      %0 = affine.load %arg0[%arg3] : memref<1024xi32>
+      %1 = affine.load %arg1[%arg3] : memref<1024xi32>
+      %2 = arith.minsi %0, %1 : i32 // signed integer minimum
+      affine.store %2, %arg2[%arg3] : memref<1024xi32>
+    }
+    return
+  }
+}
diff --git a/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem.mlir b/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem.mlir
index bf2db4c50a..e0de66a437 100644
--- a/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem.mlir
+++ b/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem.mlir
@@ -1,7 +1,11 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mkdir -p %t/data; cd %t
 // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o
-// RUN: mkdir -p data
+// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o
 // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
diff --git a/test/unit_tests/aievec_tests/i32xi32_min_elem/testbench.cc b/test/unit_tests/aievec_tests/i32xi32_min_elem/testbench.cc
index b4e019a3ce..f8ee9c2716 100644
--- a/test/unit_tests/aievec_tests/i32xi32_min_elem/testbench.cc
+++ b/test/unit_tests/aievec_tests/i32xi32_min_elem/testbench.cc
@@ -4,7 +4,19 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+
+#ifdef TO_CPP
 void dut(int32_t *restrict in0, int32_t *restrict in1, int32_t *restrict out0);
+#elif TO_LLVM
+extern "C" {
+void dut(int32_t *in0_allocated, int32_t *in0_aligned, int64_t in0_offset,
+         int64_t in0_sizes_0, int64_t in0_strides_0, int32_t *in1_allocated,
+         int32_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0,
+         int64_t in1_strides_0, int32_t *out0_allocated, int32_t *out0_aligned,
+         int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0);
+}
+#endif
+
 void dut_ref(int32_t *in0, int32_t *in1, int32_t *out0);
 
 alignas(32) int32_t g_in0[IN0_SIZE];
@@ -26,7 +38,11 @@ int main(int argc, char *argv[]) {
 
   chess_memory_fence();
   auto cyclesBegin = chess_cycle_count();
+#ifdef TO_CPP
   dut(g_in0, g_in1, g_out0);
+#elif TO_LLVM
+  dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0);
+#endif
   auto cyclesEnd = chess_cycle_count();
   chess_memory_fence();
 
diff --git a/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem-llvm.mlir b/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem-llvm.mlir
new file mode 100644
index 0000000000..9745852914
--- /dev/null
+++ b/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem-llvm.mlir
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// REQUIRES: peano
+// RUN: mkdir -p %t/data; cd %t
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=64" %vector-to-llvmir% -o llvmir.mlir
+// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll
+// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Scalar kernel under test: %arg2[i] = signed max(%arg0[i], %arg1[i]) for 1024 i8 elements.
+  func.func @dut(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>, %arg2: memref<1024xi8>) {
+    memref.assume_alignment %arg0, 32 : memref<1024xi8> // asserts %arg0 (first input) is 32-byte aligned
+    memref.assume_alignment %arg1, 32 : memref<1024xi8> // asserts %arg1 (second input) is 32-byte aligned
+    memref.assume_alignment %arg2, 32 : memref<1024xi8> // asserts %arg2 (output) is 32-byte aligned
+    affine.for %arg3 = 0 to 1024 { // scalar loop over all elements; the aie-opt command above runs affine-super-vectorize (virtual-vector-size=64) on it
+      %0 = affine.load %arg0[%arg3] : memref<1024xi8>
+      %1 = affine.load %arg1[%arg3] : memref<1024xi8>
+      %2 = arith.maxsi %0, %1 : i8 // signed integer maximum
+      affine.store %2, %arg2[%arg3] : memref<1024xi8>
+    }
+    return
+  }
+}
diff --git a/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem.mlir b/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem.mlir
index 6e869eb9e4..b69055b52a 100644
--- a/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem.mlir
+++ b/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem.mlir
@@ -1,19 +1,22 @@
-// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o
-// RUN: mkdir -p data
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mkdir -p %t/data; cd %t
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=64" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o
 // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
 
 module {
   func.func @dut(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>, %arg2: memref<1024xi8>) {
-    %c0_i8 = arith.constant 0 : i8
-    affine.for %arg3 = 0 to 1024 step 32 {
-      %0 = vector.transfer_read %arg0[%arg3], %c0_i8 : memref<1024xi8>, vector<64xi8>
-      %1 = vector.transfer_read %arg1[%arg3], %c0_i8 : memref<1024xi8>, vector<64xi8>
-      %2 = arith.maxsi %0, %1 : vector<64xi8>
-      vector.transfer_write %2, %arg2[%arg3] : vector<64xi8>, memref<1024xi8>
+    affine.for %arg3 = 0 to 1024 {
+      %0 = affine.load %arg0[%arg3] : memref<1024xi8>
+      %1 = affine.load %arg1[%arg3] : memref<1024xi8>
+      %2 = arith.maxsi %0, %1 : i8
+      affine.store %2, %arg2[%arg3] : memref<1024xi8>
     }
     return
   }
diff --git a/test/unit_tests/aievec_tests/i8xi8_max_elem/testbench.cc b/test/unit_tests/aievec_tests/i8xi8_max_elem/testbench.cc
index 6e3f1ba3d9..f9fdb84a62 100644
--- a/test/unit_tests/aievec_tests/i8xi8_max_elem/testbench.cc
+++ b/test/unit_tests/aievec_tests/i8xi8_max_elem/testbench.cc
@@ -4,7 +4,19 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+
+#ifdef TO_CPP
 void dut(int8_t *restrict in0, int8_t *restrict in1, int8_t *restrict out0);
+#elif TO_LLVM
+extern "C" {
+void dut(int8_t *in0_allocated, int8_t *in0_aligned, int64_t in0_offset,
+         int64_t in0_sizes_0, int64_t in0_strides_0, int8_t *in1_allocated,
+         int8_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0,
+         int64_t in1_strides_0, int8_t *out0_allocated, int8_t *out0_aligned,
+         int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0);
+}
+#endif
+
 void dut_ref(int8_t *in0, int8_t *in1, int8_t *out0);
 
 alignas(32) int8_t g_in0[IN0_SIZE];
@@ -26,7 +38,11 @@ int main(int argc, char *argv[]) {
 
   chess_memory_fence();
   auto cyclesBegin = chess_cycle_count();
+#ifdef TO_CPP
   dut(g_in0, g_in1, g_out0);
+#elif TO_LLVM
+  dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0);
+#endif
   auto cyclesEnd = chess_cycle_count();
   chess_memory_fence();
 
diff --git a/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem-llvm.mlir b/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem-llvm.mlir
new file mode 100644
index 0000000000..0b02971e0d
--- /dev/null
+++ b/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem-llvm.mlir
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// REQUIRES: peano
+// RUN: mkdir -p %t/data; cd %t
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=64" %vector-to-llvmir% -o llvmir.mlir
+// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll
+// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module { // Scalar kernel under test: %arg2[i] = signed min(%arg0[i], %arg1[i]) for 1024 i8 elements.
+  func.func @dut(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>, %arg2: memref<1024xi8>) {
+    memref.assume_alignment %arg0, 32 : memref<1024xi8> // asserts %arg0 (first input) is 32-byte aligned
+    memref.assume_alignment %arg1, 32 : memref<1024xi8> // asserts %arg1 (second input) is 32-byte aligned
+    memref.assume_alignment %arg2, 32 : memref<1024xi8> // asserts %arg2 (output) is 32-byte aligned
+    affine.for %arg3 = 0 to 1024 { // scalar loop over all elements; the aie-opt command above runs affine-super-vectorize (virtual-vector-size=64) on it
+      %0 = affine.load %arg0[%arg3] : memref<1024xi8>
+      %1 = affine.load %arg1[%arg3] : memref<1024xi8>
+      %2 = arith.minsi %0, %1 : i8 // signed integer minimum
+      affine.store %2, %arg2[%arg3] : memref<1024xi8>
+    }
+    return
+  }
+}
diff --git a/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem.mlir b/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem.mlir
index 0e8522b990..dcdf2cc288 100644
--- a/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem.mlir
+++ b/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem.mlir
@@ -1,19 +1,22 @@
-// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o
-// RUN: mkdir -p data
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mkdir -p %t/data; cd %t
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=64" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o
+// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o
 // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
 
 module {
   func.func @dut(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>, %arg2: memref<1024xi8>) {
-    %c0_i8 = arith.constant 0 : i8
-    affine.for %arg3 = 0 to 1024 step 32 {
-      %0 = vector.transfer_read %arg0[%arg3], %c0_i8 : memref<1024xi8>, vector<64xi8>
-      %1 = vector.transfer_read %arg1[%arg3], %c0_i8 : memref<1024xi8>, vector<64xi8>
-      %2 = arith.minsi %0, %1 : vector<64xi8>
-      vector.transfer_write %2, %arg2[%arg3] : vector<64xi8>, memref<1024xi8>
+    affine.for %arg3 = 0 to 1024 {
+      %0 = affine.load %arg0[%arg3] : memref<1024xi8>
+      %1 = affine.load %arg1[%arg3] : memref<1024xi8>
+      %2 = arith.minsi %0, %1 : i8
+      affine.store %2, %arg2[%arg3] : memref<1024xi8>
     }
     return
   }
diff --git a/test/unit_tests/aievec_tests/i8xi8_min_elem/testbench.cc b/test/unit_tests/aievec_tests/i8xi8_min_elem/testbench.cc
index b4b1102193..0aed91af2f 100644
--- a/test/unit_tests/aievec_tests/i8xi8_min_elem/testbench.cc
+++ b/test/unit_tests/aievec_tests/i8xi8_min_elem/testbench.cc
@@ -4,7 +4,19 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+
+#ifdef TO_CPP
 void dut(int8_t *restrict in0, int8_t *restrict in1, int8_t *restrict out0);
+#elif TO_LLVM
+extern "C" {
+void dut(int8_t *in0_allocated, int8_t *in0_aligned, int64_t in0_offset,
+         int64_t in0_sizes_0, int64_t in0_strides_0, int8_t *in1_allocated,
+         int8_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0,
+         int64_t in1_strides_0, int8_t *out0_allocated, int8_t *out0_aligned,
+         int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0);
+}
+#endif
+
 void dut_ref(int8_t *in0, int8_t *in1, int8_t *out0);
 
 alignas(32) int8_t g_in0[IN0_SIZE];
@@ -26,7 +38,11 @@ int main(int argc, char *argv[]) {
 
   chess_memory_fence();
   auto cyclesBegin = chess_cycle_count();
+#ifdef TO_CPP
   dut(g_in0, g_in1, g_out0);
+#elif TO_LLVM
+  dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0);
+#endif
   auto cyclesEnd = chess_cycle_count();
   chess_memory_fence();