diff --git a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp index 5798114972..866689c0a0 100644 --- a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp +++ b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp @@ -1016,20 +1016,7 @@ struct LowerVectorAddOrSubOpToAIEVecAddElemOrSubElemOp if (lhsExt && rhsExt) { auto lval = lhsExt->getOperand(0); auto rval = rhsExt->getOperand(0); - VectorType lSrcType = cast<VectorType>(lval.getType()); - VectorType rSrcType = cast<VectorType>(rval.getType()); - - unsigned lBitWidth = - lSrcType.getElementType().getIntOrFloatBitWidth(); - unsigned rBitWidth = - rSrcType.getElementType().getIntOrFloatBitWidth(); - - if ((lBitWidth != 8 || rBitWidth != 8) && - (lBitWidth != 16 || rBitWidth != 16)) { - return genAddElemAieML(rewriter, lhs, rhs, - resultType, srcOp); - } Type accType = getVectorOpDestType(lSrcType, /*AIEML =*/true); auto lUpsOp = diff --git a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp index caf9829198..a56f6feb4a 100644 --- a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp +++ b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp @@ -1878,17 +1878,17 @@ static LogicalResult printOperation(CppEmitter &emitter, return success(); } -// Print an expand shape by forwarding the value to the next op -static LogicalResult printOperation(CppEmitter &emitter, - memref::ExpandShapeOp expandShapeOp) { - Value source = expandShapeOp.getSrc(); +// Print an operation by forwarding the value to the next op +template <typename OpTy> +static LogicalResult printValueForwardOperation(CppEmitter &emitter, OpTy op) { + Value source = op.getSrc(); // If the memref being outputted is not already emitted, // error out if (!emitter.hasValueInScope(source)) return failure(); - if (failed(emitter.emitAssignPrefix(*expandShapeOp))) + if (failed(emitter.emitAssignPrefix(*op))) return failure(); raw_indented_ostream &os = emitter.ostream(); @@ 
-1898,6 +1898,20 @@ static LogicalResult printOperation(CppEmitter &emitter, return success(); } +// Print an expand shape by forwarding the value to the next op +static LogicalResult printOperation(CppEmitter &emitter, + memref::ExpandShapeOp expandShapeOp) { + return printValueForwardOperation(emitter, + expandShapeOp); +} + +// Print a collapse shape by forwarding the value to the next op +static LogicalResult printOperation(CppEmitter &emitter, + memref::CollapseShapeOp collapseShapeOp) { + return printValueForwardOperation(emitter, + collapseShapeOp); +} + static LogicalResult printConstantOp(CppEmitter &emitter, Operation *operation, Attribute value) { OpResult result = operation->getResult(0); @@ -2874,9 +2888,8 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) { .Case( [&](auto op) { return printOperation(*this, op); }) // Memref ops. - .Case( [&](auto op) { return printOperation(*this, op); }) - .Case( + .Case( [&](auto op) { return printOperation(*this, op); }) .Case& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<bf16>) -> (tensor<16x1024xbf16>) { + %0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1>} : (tensor<bf16>) -> (tensor<1x1xbf16>) + %1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xbf16>, tensor<1x1xbf16>) -> (tensor<16x1024xbf16>) + return %1 : tensor<16x1024xbf16> + } +} + + diff --git a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_scalar/defines.h b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_scalar/defines.h new file mode 100644 index 0000000000..1cc95cb4d3 --- /dev/null +++ b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_scalar/defines.h @@ -0,0 +1,4 @@ +#pragma once +constexpr unsigned const IN0_SIZE = 16 * 1024; +constexpr unsigned const IN1_SIZE = 1; +constexpr unsigned const OUT0_SIZE = 16 * 1024; diff --git 
a/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_scalar/testbench.cc b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_scalar/testbench.cc new file mode 100644 index 0000000000..a70acfe326 --- /dev/null +++ b/test/Integration/Dialect/TOSA/bf16xbf16_sub_elem_2d_broadcast_scalar/testbench.cc @@ -0,0 +1,56 @@ +#include "../common/testbench.h" +#include "defines.h" +#include +#include +#include +#include +void dut(bfloat16 *restrict in0, bfloat16 *restrict in1, + bfloat16 *restrict out0); +void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0); + +alignas(32) bfloat16 g_in0[IN0_SIZE]; +alignas(32) bfloat16 g_in1[IN1_SIZE]; +alignas(32) bfloat16 g_out0[OUT0_SIZE]; +alignas(32) bfloat16 g_out0Ref[OUT0_SIZE]; + +int main(int argc, char *argv[]) { + std::string dataDir(TO_STR(DATA_DIR)); + srand(10); + std::generate(g_in0, g_in0 + IN0_SIZE, + [&]() { return random_bfloat16(-10, 10, 2); }); + std::generate(g_in1, g_in1 + IN1_SIZE, + [&]() { return random_bfloat16(-10, 10, 2); }); + + writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); + writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt"); + + chess_memory_fence(); + auto cyclesBegin = chess_cycle_count(); + dut(g_in0, g_in1, g_out0); + auto cyclesEnd = chess_cycle_count(); + chess_memory_fence(); + + auto cycleCount = (int)(cyclesEnd - cyclesBegin); + reportCycleCount(cycleCount, dataDir + "/cycle_count.txt"); + + writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); + + dut_ref(g_in0, g_in1, g_out0Ref); + writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); + + bool ok = true; + ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); + + if (ok) + printf("TEST PASSED\n"); + else + printf("TEST FAILED\n"); + + return ok ? 
0 : 1; +} + +void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) { + for (unsigned k = 0; k < OUT0_SIZE; k += 1) { + out0[k] = in0[k] - in1[k % IN1_SIZE]; + } +} diff --git a/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/defines.h b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/defines.h new file mode 100644 index 0000000000..1cc95cb4d3 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/defines.h @@ -0,0 +1,4 @@ +#pragma once +constexpr unsigned const IN0_SIZE = 16 * 1024; +constexpr unsigned const IN1_SIZE = 1; +constexpr unsigned const OUT0_SIZE = 16 * 1024; diff --git a/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/dut.cc b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/dut.cc new file mode 100644 index 0000000000..73d36f406d --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/dut.cc @@ -0,0 +1,27 @@ +// clang-format off +void dut(int16_t * restrict v1, int16_t * restrict v2, int16_t * restrict v3) { + size_t v4 = 0; + v32int16 v5 = *(v32int16 *)(v2 + v4); + v32int16 v6 = broadcast_elem(v5, 0); + size_t v7 = 0; + size_t v8 = 16; + size_t v9 = 1; + for (size_t v10 = v7; v10 < v8; v10 += v9) + chess_prepare_for_pipelining + chess_loop_range(16, 16) + { + size_t v11 = 0; + size_t v12 = 1024; + size_t v13 = 32; + for (size_t v14 = v11; v14 < v12; v14 += v13) + chess_prepare_for_pipelining + chess_loop_range(32, 32) + { + v32int16 v15 = *(v32int16 *)(v1 + 1024*v10+v14); + v32int16 v16 = sub(v15, v6); + *(v32int16 *)(v3 + 1024*v10+v14) = v16; + } + } + return; +} +// clang-format on diff --git a/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/i16xi16_sub_elem_2d_broadcast_1d_unit_dim.mlir b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/i16xi16_sub_elem_2d_broadcast_1d_unit_dim.mlir new file mode 100644 index 0000000000..8e17485218 
--- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/i16xi16_sub_elem_2d_broadcast_1d_unit_dim.mlir @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir +// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=32" -o affine.mlir +// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir +// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. 
%S/testbench.cc dut.cc >& xchesscc_wrapper.stdout +// RUN: mkdir -p data +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: tensor<16x1024xi16>, %arg1: tensor<1xi16>) -> (tensor<16x1024xi16>) { + %0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1>} : (tensor<1xi16>) -> (tensor<1x1xi16>) + %1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xi16>, tensor<1x1xi16>) -> (tensor<16x1024xi16>) + return %1 : tensor<16x1024xi16> + } +} + + diff --git a/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/testbench.cc b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/testbench.cc new file mode 100644 index 0000000000..06e28e9975 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_1d_unit_dim/testbench.cc @@ -0,0 +1,55 @@ +#include "../common/testbench.h" +#include "defines.h" +#include <algorithm> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +void dut(int16_t *restrict in0, int16_t *restrict in1, int16_t *restrict out0); +void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0); + +alignas(32) int16_t g_in0[IN0_SIZE]; +alignas(32) int16_t g_in1[IN1_SIZE]; +alignas(32) int16_t g_out0[OUT0_SIZE]; +alignas(32) int16_t g_out0Ref[OUT0_SIZE]; + +int main(int argc, char *argv[]) { + std::string dataDir(TO_STR(DATA_DIR)); + srand(10); + std::generate(g_in0, g_in0 + IN0_SIZE, + [&]() { return random_integer(); }); + std::generate(g_in1, g_in1 + IN1_SIZE, + [&]() { return random_integer(); }); + + writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); + writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt"); + + chess_memory_fence(); + auto cyclesBegin = chess_cycle_count(); + dut(g_in0, g_in1, g_out0); + auto cyclesEnd = chess_cycle_count(); + chess_memory_fence(); + + auto cycleCount = (int)(cyclesEnd - cyclesBegin); + reportCycleCount(cycleCount, dataDir + 
"/cycle_count.txt"); + + writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); + + dut_ref(g_in0, g_in1, g_out0Ref); + writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); + + bool ok = true; + ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); + + if (ok) + printf("TEST PASSED\n"); + else + printf("TEST FAILED\n"); + + return ok ? 0 : 1; +} + +void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0) { + for (unsigned k = 0; k < OUT0_SIZE; k += 1) { + out0[k] = in0[k] - in1[k % IN1_SIZE]; + } +} diff --git a/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/defines.h b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/defines.h new file mode 100644 index 0000000000..1cc95cb4d3 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/defines.h @@ -0,0 +1,4 @@ +#pragma once +constexpr unsigned const IN0_SIZE = 16 * 1024; +constexpr unsigned const IN1_SIZE = 1; +constexpr unsigned const OUT0_SIZE = 16 * 1024; diff --git a/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/dut.cc b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/dut.cc new file mode 100644 index 0000000000..6d23a88b05 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/dut.cc @@ -0,0 +1,28 @@ +// clang-format off +void dut(int16_t * restrict v1, int16_t * restrict v2, int16_t * restrict v3) { + size_t v4 = 0; + int16_t * restrict v5 = v2; + v32int16 v6 = *(v32int16 *)(v5 + v4); + v32int16 v7 = broadcast_elem(v6, 0); + size_t v8 = 0; + size_t v9 = 16; + size_t v10 = 1; + for (size_t v11 = v8; v11 < v9; v11 += v10) + chess_prepare_for_pipelining + chess_loop_range(16, 16) + { + size_t v12 = 0; + size_t v13 = 1024; + size_t v14 = 32; + for (size_t v15 = v12; v15 < v13; v15 += v14) + chess_prepare_for_pipelining + chess_loop_range(32, 32) + { + v32int16 v16 = *(v32int16 *)(v1 + 1024*v11+v15); + v32int16 v17 = sub(v16, v7); + *(v32int16 *)(v3 + 1024*v11+v15) = v17; + } + } + 
return; +} +// clang-format on diff --git a/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/i16xi16_sub_elem_2d_broadcast_1d_unit_dim.mlir b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/i16xi16_sub_elem_2d_broadcast_1d_unit_dim.mlir new file mode 100644 index 0000000000..85fa8de4c0 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/i16xi16_sub_elem_2d_broadcast_1d_unit_dim.mlir @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir +// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=32" -o affine.mlir +// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir +// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. 
%S/testbench.cc dut.cc >& xchesscc_wrapper.stdout +// RUN: mkdir -p data +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: tensor<16x1024xi16>, %arg1: tensor<i16>) -> (tensor<16x1024xi16>) { + %0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1>} : (tensor<i16>) -> (tensor<1x1xi16>) + %1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xi16>, tensor<1x1xi16>) -> (tensor<16x1024xi16>) + return %1 : tensor<16x1024xi16> + } +} + + diff --git a/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/testbench.cc b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/testbench.cc new file mode 100644 index 0000000000..06e28e9975 --- /dev/null +++ b/test/Integration/Dialect/TOSA/i16xi16_sub_elem_2d_broadcast_scalar/testbench.cc @@ -0,0 +1,55 @@ +#include "../common/testbench.h" +#include "defines.h" +#include <algorithm> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +void dut(int16_t *restrict in0, int16_t *restrict in1, int16_t *restrict out0); +void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0); + +alignas(32) int16_t g_in0[IN0_SIZE]; +alignas(32) int16_t g_in1[IN1_SIZE]; +alignas(32) int16_t g_out0[OUT0_SIZE]; +alignas(32) int16_t g_out0Ref[OUT0_SIZE]; + +int main(int argc, char *argv[]) { + std::string dataDir(TO_STR(DATA_DIR)); + srand(10); + std::generate(g_in0, g_in0 + IN0_SIZE, + [&]() { return random_integer(); }); + std::generate(g_in1, g_in1 + IN1_SIZE, + [&]() { return random_integer(); }); + + writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); + writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt"); + + chess_memory_fence(); + auto cyclesBegin = chess_cycle_count(); + dut(g_in0, g_in1, g_out0); + auto cyclesEnd = chess_cycle_count(); + chess_memory_fence(); + + auto cycleCount = (int)(cyclesEnd - cyclesBegin); + reportCycleCount(cycleCount, dataDir + "/cycle_count.txt"); + + 
writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); + + dut_ref(g_in0, g_in1, g_out0Ref); + writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); + + bool ok = true; + ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE); + + if (ok) + printf("TEST PASSED\n"); + else + printf("TEST FAILED\n"); + + return ok ? 0 : 1; +} + +void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0) { + for (unsigned k = 0; k < OUT0_SIZE; k += 1) { + out0[k] = in0[k] - in1[k % IN1_SIZE]; + } +} diff --git a/test/Integration/Dialect/TOSA/i8xi16_sub_elem/dut.cc b/test/Integration/Dialect/TOSA/i8xi16_sub_elem/dut.cc new file mode 100644 index 0000000000..c4c40feefd --- /dev/null +++ b/test/Integration/Dialect/TOSA/i8xi16_sub_elem/dut.cc @@ -0,0 +1,20 @@ +// clang-format off +void dut(int8_t * restrict v1, int16_t * restrict v2, int32_t * restrict v3) { + size_t v4 = 0; + size_t v5 = 1024; + size_t v6 = 32; + for (size_t v7 = v4; v7 < v5; v7 += v6) + chess_prepare_for_pipelining + chess_loop_range(32, 32) + { + v32int8 v8 = *(v32int8 *)(v1 + v7); + v32int16 v9 = *(v32int16 *)(v2 + v7); + v32acc32 v10 = ups_to_v32acc32(v8, 0); + v32acc32 v11 = ups_to_v32acc32(v9, 0); + v32acc32 v12 = sub(v10, v11); + v32int32 v13 = v32int32(v12); + *(v32int32 *)(v3 + v7) = v13; + } + return; +} +// clang-format on diff --git a/test/Integration/Dialect/TOSA/i8xi16_sub_elem/i8xi16_sub_elem.mlir b/test/Integration/Dialect/TOSA/i8xi16_sub_elem/i8xi16_sub_elem.mlir index 5ff09fae6d..363f0f1343 100644 --- a/test/Integration/Dialect/TOSA/i8xi16_sub_elem/i8xi16_sub_elem.mlir +++ b/test/Integration/Dialect/TOSA/i8xi16_sub_elem/i8xi16_sub_elem.mlir @@ -1,7 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // Copyright (C) 2023, Advanced Micro Devices, Inc. 
-// XFAIL: * // REQUIRES: valid_xchess_license // RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir // RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=32" -o affine.mlir