diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index d11565316..fdacf7cbc 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -759,19 +759,19 @@ def run(self, config): ) # Test(s) of the form matmul(A,B) + truncf(C) where A:MxK, B:KxN and C:MxN - template_name = test_files_dir / "matmul_truncf_32x32x32_bf16_bf16.mlir" - identity_mat = np.eye(32, dtype=np.float32) - ones = np.ones(32 * 32, dtype=np.float32).reshape([32, 32]) - lhs = ones * 192 - rhs = identity_mat * 2 - input_args = generate_inputs( - template_name, output_dir, 1, {1: lhs, 2: rhs} - ) + test_name = output_dir / f"test_from_template_matmul_truncf.mlir" + template_name = matmul_template_dir / "matmul_truncf_MxK_KxN.mlir" + generate_matmul_test(test_name, template_name, 8, 8, 8, "bf16", "f32") + identity_mat = np.eye(8, dtype=np.float32) + ones = np.ones(8 * 8, dtype=np.float32).reshape([8, 8]) + lhs = ones * 101 + rhs = identity_mat * 3 + input_args = generate_inputs(test_name, output_dir, 1, {1: lhs, 2: rhs}) aie_vs_baseline( config, - template_name, + test_name, input_args, - lhs * 2, # exected output + ones * 302, # expected output use_ukernel=False, tile_pipeline="pack-peel", lower_to_aie_pipeline="objectFifo", @@ -780,7 +780,7 @@ def run(self, config): rtol=0, atol=0, n_repeats=1, - output_type=get_output_type(template_name), + output_type=get_output_type(test_name), ) diff --git a/build_tools/ci/cpu_comparison/test_files/matmul_truncf_32x32x32_bf16_bf16.mlir b/build_tools/ci/cpu_comparison/test_files/matmul_truncf_32x32x32_bf16_bf16.mlir deleted file mode 100644 index 39a58df13..000000000 --- a/build_tools/ci/cpu_comparison/test_files/matmul_truncf_32x32x32_bf16_bf16.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// These lines are required for e2e numerical testing: -// input 32x32xbf16 -// input 32x32xbf16 -// output 32x32xbf16 - -func.func @matmul_truncf(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>) -> 
tensor<32x32xbf16> -{ - %cst = arith.constant 0.0 : f32 - %0 = tensor.empty() : tensor<32x32xf32> - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<32x32xf32>) -> tensor<32x32xf32> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<32x32xbf16>, tensor<32x32xbf16>) - outs(%1: tensor<32x32xf32>) -> tensor<32x32xf32> - %3 = arith.truncf %2 : tensor<32x32xf32> to tensor<32x32xbf16> - return %3: tensor<32x32xbf16> -} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp index bca2a4e71..e5a797650 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp @@ -37,8 +37,9 @@ namespace { /// TODO(newling) improve this design. static bool isCoreComputeOp(Operation *op) { return isa(op); + memref::ExtractStridedMetadataOp, func::CallOp, arith::ExtFOp, + arith::TruncFOp, vector::TransferReadOp, vector::TransferWriteOp>( + op); } /// Utility to map the parallel mapping attributes to the corresponding