Fix TOSA broadcast and mixed precision tests (#631)
Fix the following TOSA tests:
- bf16xbf16_sub_elem_2d_broadcast_2d
- i8xi16_sub_elem
Add the following new TOSA tests:
- i16xi16_sub_elem_2d_broadcast_scalar (pass)
- i16xi16_sub_elem_2d_broadcast_1d_unit_dim (pass)
- bf16xbf16_sub_elem_2d_broadcast_scalar (xfail)
jamestcl-amd authored Sep 8, 2023
1 parent 83da581 commit ceb4dfe
Showing 17 changed files with 373 additions and 23 deletions.
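
For orientation, the i8xi16_sub_elem fix concerns elementwise subtraction of mixed-width operands. A rough scalar reference for what that kernel computes (an illustrative sketch, not a file in this commit; the function name and the i32 result width are assumptions, in the spirit of the dut_ref() loops in the testbenches below):

#include <cstdint>

// Illustrative only: elementwise subtract with mixed i8 x i16 inputs,
// both operands widened before the subtraction.
void i8xi16_sub_elem_ref(const int8_t *in0, const int16_t *in1,
                         int32_t *out0, unsigned n) {
  for (unsigned k = 0; k < n; ++k)
    out0[k] = static_cast<int32_t>(in0[k]) - static_cast<int32_t>(in1[k]);
}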
13 changes: 0 additions & 13 deletions lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
@@ -1016,20 +1016,7 @@ struct LowerVectorAddOrSubOpToAIEVecAddElemOrSubElemOp
     if (lhsExt && rhsExt) {
       auto lval = lhsExt->getOperand(0);
       auto rval = rhsExt->getOperand(0);
-
       VectorType lSrcType = cast<VectorType>(lval.getType());
       VectorType rSrcType = cast<VectorType>(rval.getType());
-
-      unsigned lBitWidth =
-          lSrcType.getElementType().getIntOrFloatBitWidth();
-      unsigned rBitWidth =
-          rSrcType.getElementType().getIntOrFloatBitWidth();
-
-      if ((lBitWidth != 8 || rBitWidth != 8) &&
-          (lBitWidth != 16 || rBitWidth != 16)) {
-        return genAddElemAieML<SrcOpTy, DstOpTy>(rewriter, lhs, rhs,
-                                                 resultType, srcOp);
-      }
-
       Type accType = getVectorOpDestType(lSrcType, /*AIEML =*/true);
       auto lUpsOp =
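
With that bitwidth guard removed, operand pairs whose source widths are not uniformly 8-bit or 16-bit no longer fall back to genAddElemAieML; they keep following the accumulator path, where both sides are up-shifted (ups) into a wider accumulator, subtracted there, and narrowed on the way out. The generated bf16 kernel later in this commit shows exactly that shape; condensed into a standalone sketch (intrinsic and vector type names copied from that dut.cc, the wrapper signature is an assumption):

// Condensed from the generated bf16 kernel below: widen both operands to
// a float accumulator, subtract there, then narrow back to bf16.
v16bfloat16 sub_bf16(v16bfloat16 a, v16bfloat16 b) {
  v16accfloat accA = ups_to_v16accfloat(a); // up-shift lhs
  v16accfloat accB = ups_to_v16accfloat(b); // up-shift rhs
  v16accfloat acc = sub(accA, accB);        // subtract in the accumulator
  return to_v16bfloat16(acc);               // narrow back to bf16
}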
29 changes: 21 additions & 8 deletions lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
@@ -1878,17 +1878,17 @@ static LogicalResult printOperation(CppEmitter &emitter,
   return success();
 }

-// Print an expand shape by forwarding the value to the next op
-static LogicalResult printOperation(CppEmitter &emitter,
-                                    memref::ExpandShapeOp expandShapeOp) {
-  Value source = expandShapeOp.getSrc();
+// Print an operation by forwarding the value to the next op
+template <typename OpTy>
+static LogicalResult printValueForwardOperation(CppEmitter &emitter, OpTy op) {
+  Value source = op.getSrc();

   // If the memref being outputted is not already emitted,
   // error out
   if (!emitter.hasValueInScope(source))
     return failure();

-  if (failed(emitter.emitAssignPrefix(*expandShapeOp)))
+  if (failed(emitter.emitAssignPrefix(*op)))
     return failure();

   raw_indented_ostream &os = emitter.ostream();
@@ -1898,6 +1898,20 @@
   return success();
 }

+// Print an expand shape by forwarding the value to the next op
+static LogicalResult printOperation(CppEmitter &emitter,
+                                    memref::ExpandShapeOp expandShapeOp) {
+  return printValueForwardOperation<memref::ExpandShapeOp>(emitter,
+                                                           expandShapeOp);
+}
+
+// Print a collapse shape by forwarding the value to the next op
+static LogicalResult printOperation(CppEmitter &emitter,
+                                    memref::CollapseShapeOp collapseShapeOp) {
+  return printValueForwardOperation<memref::CollapseShapeOp>(emitter,
+                                                             collapseShapeOp);
+}
+
 static LogicalResult printConstantOp(CppEmitter &emitter, Operation *operation,
                                      Attribute value) {
   OpResult result = operation->getResult(0);
@@ -2874,9 +2888,8 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
           .Case<vector::TransferWriteOp>(
               [&](auto op) { return printOperation(*this, op); })
           // Memref ops.
-          .Case<memref::StoreOp>(
-              [&](auto op) { return printOperation(*this, op); })
-          .Case<memref::ExpandShapeOp>(
+          .Case<memref::StoreOp, memref::ExpandShapeOp,
+                memref::CollapseShapeOp>(
               [&](auto op) { return printOperation(*this, op); })
           .Case<aievec::AddOp, aievec::AddElemOp, aievec::ConcatOp,
                 aievec::ExtOp, aievec::FMAOp, aievec::MulOp, aievec::PackOp,
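
Since memref.expand_shape and memref.collapse_shape only reinterpret a buffer's shape, the emitter can print both by forwarding the source value; in the generated kernels below this surfaces as a bare pointer alias (e.g. bfloat16 * restrict v4 = v2;). A standalone analogue of the refactor (simplified stand-ins, not the real CppEmitter API):

#include <iostream>
#include <string>

// Two op kinds that only need their source forwarded share one template,
// mirroring printValueForwardOperation above (the types here are mock-ups).
struct ExpandShape   { std::string src; };
struct CollapseShape { std::string src; };

template <typename OpTy>
void printValueForward(std::ostream &os, const OpTy &op,
                       const std::string &result) {
  // The reshape emits no computation, just an alias of its source value.
  os << "auto " << result << " = " << op.src << ";\n";
}

int main() {
  printValueForward(std::cout, ExpandShape{"v2"}, "v4");   // auto v4 = v2;
  printValueForward(std::cout, CollapseShape{"v2"}, "v5"); // auto v5 = v2;
}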
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 // Copyright (C) 2023, Advanced Micro Devices, Inc.

-// XFAIL: *
 // REQUIRES: valid_xchess_license
 // RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
 // RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --linalg-fold-unit-extent-dims --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir
@@ -0,0 +1,29 @@
// clang-format off
void dut(bfloat16 * restrict v1, bfloat16 * restrict v2, bfloat16 * restrict v3) {
  bfloat16 * restrict v4 = v2;
  size_t v5 = 0;
  size_t v6 = 16;
  size_t v7 = 1;
  for (size_t v8 = v5; v8 < v6; v8 += v7)
  chess_prepare_for_pipelining
  chess_loop_range(16, 16)
  {
    size_t v9 = 0;
    size_t v10 = 1024;
    size_t v11 = 16;
    for (size_t v12 = v9; v12 < v10; v12 += v11)
    chess_prepare_for_pipelining
    chess_loop_range(64, 64)
    {
      v16bfloat16 v13 = *(v16bfloat16 *)(v1 + 1024*v8+v12);
      v16bfloat16 v14 = *(v16bfloat16 *)(v4 + v12);
      v16accfloat v15 = ups_to_v16accfloat(v13);
      v16accfloat v16 = ups_to_v16accfloat(v14);
      v16accfloat v17 = sub(v15, v16);
      v16bfloat16 v18 = to_v16bfloat16(v17);
      *(v16bfloat16 *)(v3 + 1024*v8+v12) = v18;
    }
  }
  return;
}
// clang-format on
@@ -0,0 +1,24 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2023, Advanced Micro Devices, Inc.

// XFAIL: *
// REQUIRES: valid_xchess_license
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir
// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED

module {
  func.func @dut(%arg0: tensor<16x1024xbf16>, %arg1: tensor<bf16>) -> (tensor<16x1024xbf16>) {
    %0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1>} : (tensor<bf16>) -> (tensor<1x1xbf16>)
    %1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xbf16>, tensor<1x1xbf16>) -> (tensor<16x1024xbf16>)
    return %1 : tensor<16x1024xbf16>
  }
}


@@ -0,0 +1,4 @@
#pragma once
constexpr unsigned const IN0_SIZE = 16 * 1024;
constexpr unsigned const IN1_SIZE = 1;
constexpr unsigned const OUT0_SIZE = 16 * 1024;
@@ -0,0 +1,56 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
void dut(bfloat16 *restrict in0, bfloat16 *restrict in1,
         bfloat16 *restrict out0);
void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0);

alignas(32) bfloat16 g_in0[IN0_SIZE];
alignas(32) bfloat16 g_in1[IN1_SIZE];
alignas(32) bfloat16 g_out0[OUT0_SIZE];
alignas(32) bfloat16 g_out0Ref[OUT0_SIZE];

int main(int argc, char *argv[]) {
  std::string dataDir(TO_STR(DATA_DIR));
  srand(10);
  std::generate(g_in0, g_in0 + IN0_SIZE,
                [&]() { return random_bfloat16(-10, 10, 2); });
  std::generate(g_in1, g_in1 + IN1_SIZE,
                [&]() { return random_bfloat16(-10, 10, 2); });

  writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
  writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");

  chess_memory_fence();
  auto cyclesBegin = chess_cycle_count();
  dut(g_in0, g_in1, g_out0);
  auto cyclesEnd = chess_cycle_count();
  chess_memory_fence();

  auto cycleCount = (int)(cyclesEnd - cyclesBegin);
  reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

  writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");

  dut_ref(g_in0, g_in1, g_out0Ref);
  writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

  bool ok = true;
  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);

  if (ok)
    printf("TEST PASSED\n");
  else
    printf("TEST FAILED\n");

  return ok ? 0 : 1;
}

void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) {
  for (unsigned k = 0; k < OUT0_SIZE; k += 1) {
    out0[k] = in0[k] - in1[k % IN1_SIZE];
  }
}
@@ -0,0 +1,4 @@
#pragma once
constexpr unsigned const IN0_SIZE = 16 * 1024;
constexpr unsigned const IN1_SIZE = 1;
constexpr unsigned const OUT0_SIZE = 16 * 1024;
@@ -0,0 +1,27 @@
// clang-format off
void dut(int16_t * restrict v1, int16_t * restrict v2, int16_t * restrict v3) {
  size_t v4 = 0;
  v32int16 v5 = *(v32int16 *)(v2 + v4);
  v32int16 v6 = broadcast_elem(v5, 0);
  size_t v7 = 0;
  size_t v8 = 16;
  size_t v9 = 1;
  for (size_t v10 = v7; v10 < v8; v10 += v9)
  chess_prepare_for_pipelining
  chess_loop_range(16, 16)
  {
    size_t v11 = 0;
    size_t v12 = 1024;
    size_t v13 = 32;
    for (size_t v14 = v11; v14 < v12; v14 += v13)
    chess_prepare_for_pipelining
    chess_loop_range(32, 32)
    {
      v32int16 v15 = *(v32int16 *)(v1 + 1024*v10+v14);
      v32int16 v16 = sub(v15, v6);
      *(v32int16 *)(v3 + 1024*v10+v14) = v16;
    }
  }
  return;
}
// clang-format on
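
In the kernel above the broadcast is hoisted out of both loops: the single in1 element is loaded once as a v32int16, broadcast_elem(v5, 0) splats lane 0 across all 32 lanes, and the inner loop reduces to a plain vector sub. The scalar equivalent (an illustrative sketch matching the dut_ref loops in the testbenches; the function name is an assumption):

#include <cstddef>
#include <cstdint>

// Scalar view of the vector kernel above: one element of v2 is
// subtracted from every element of the 16 x 1024 input.
void dut_scalar(const int16_t *v1, const int16_t *v2, int16_t *v3) {
  const int16_t s = v2[0]; // the broadcast value (lane 0)
  for (size_t i = 0; i < 16; ++i)
    for (size_t j = 0; j < 1024; ++j)
      v3[1024 * i + j] = v1[1024 * i + j] - s;
}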
@@ -0,0 +1,23 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2023, Advanced Micro Devices, Inc.

// REQUIRES: valid_xchess_license
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=32" -o affine.mlir
// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc >& xchesscc_wrapper.stdout
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED

module {
  func.func @dut(%arg0: tensor<16x1024xi16>, %arg1: tensor<1xi16>) -> (tensor<16x1024xi16>) {
    %0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1>} : (tensor<1xi16>) -> (tensor<1x1xi16>)
    %1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xi16>, tensor<1x1xi16>) -> (tensor<16x1024xi16>)
    return %1 : tensor<16x1024xi16>
  }
}


@@ -0,0 +1,55 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
void dut(int16_t *restrict in0, int16_t *restrict in1, int16_t *restrict out0);
void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0);

alignas(32) int16_t g_in0[IN0_SIZE];
alignas(32) int16_t g_in1[IN1_SIZE];
alignas(32) int16_t g_out0[OUT0_SIZE];
alignas(32) int16_t g_out0Ref[OUT0_SIZE];

int main(int argc, char *argv[]) {
  std::string dataDir(TO_STR(DATA_DIR));
  srand(10);
  std::generate(g_in0, g_in0 + IN0_SIZE,
                [&]() { return random_integer<int16_t>(); });
  std::generate(g_in1, g_in1 + IN1_SIZE,
                [&]() { return random_integer<int16_t>(); });

  writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
  writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");

  chess_memory_fence();
  auto cyclesBegin = chess_cycle_count();
  dut(g_in0, g_in1, g_out0);
  auto cyclesEnd = chess_cycle_count();
  chess_memory_fence();

  auto cycleCount = (int)(cyclesEnd - cyclesBegin);
  reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

  writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");

  dut_ref(g_in0, g_in1, g_out0Ref);
  writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

  bool ok = true;
  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);

  if (ok)
    printf("TEST PASSED\n");
  else
    printf("TEST FAILED\n");

  return ok ? 0 : 1;
}

void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0) {
  for (unsigned k = 0; k < OUT0_SIZE; k += 1) {
    out0[k] = in0[k] - in1[k % IN1_SIZE];
  }
}
@@ -0,0 +1,4 @@
#pragma once
constexpr unsigned const IN0_SIZE = 16 * 1024;
constexpr unsigned const IN1_SIZE = 1;
constexpr unsigned const OUT0_SIZE = 16 * 1024;
@@ -0,0 +1,28 @@
// clang-format off
void dut(int16_t * restrict v1, int16_t * restrict v2, int16_t * restrict v3) {
  size_t v4 = 0;
  int16_t * restrict v5 = v2;
  v32int16 v6 = *(v32int16 *)(v5 + v4);
  v32int16 v7 = broadcast_elem(v6, 0);
  size_t v8 = 0;
  size_t v9 = 16;
  size_t v10 = 1;
  for (size_t v11 = v8; v11 < v9; v11 += v10)
  chess_prepare_for_pipelining
  chess_loop_range(16, 16)
  {
    size_t v12 = 0;
    size_t v13 = 1024;
    size_t v14 = 32;
    for (size_t v15 = v12; v15 < v13; v15 += v14)
    chess_prepare_for_pipelining
    chess_loop_range(32, 32)
    {
      v32int16 v16 = *(v32int16 *)(v1 + 1024*v11+v15);
      v32int16 v17 = sub(v16, v7);
      *(v32int16 *)(v3 + 1024*v11+v15) = v17;
    }
  }
  return;
}
// clang-format on
@@ -0,0 +1,23 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2023, Advanced Micro Devices, Inc.

// REQUIRES: valid_xchess_license
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=32" -o affine.mlir
// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc >& xchesscc_wrapper.stdout
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED

module {
  func.func @dut(%arg0: tensor<16x1024xi16>, %arg1: tensor<i16>) -> (tensor<16x1024xi16>) {
    %0 = "tosa.reshape"(%arg1) { new_shape = array<i64: 1, 1>} : (tensor<i16>) -> (tensor<1x1xi16>)
    %1 = "tosa.sub"(%arg0,%0) : (tensor<16x1024xi16>, tensor<1x1xi16>) -> (tensor<16x1024xi16>)
    return %1 : tensor<16x1024xi16>
  }
}

