diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index d11565316..fdacf7cbc 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -759,19 +759,19 @@ def run(self, config): ) # Test(s) of the form matmul(A,B) + truncf(C) where A:MxK, B:KxN and C:MxN - template_name = test_files_dir / "matmul_truncf_32x32x32_bf16_bf16.mlir" - identity_mat = np.eye(32, dtype=np.float32) - ones = np.ones(32 * 32, dtype=np.float32).reshape([32, 32]) - lhs = ones * 192 - rhs = identity_mat * 2 - input_args = generate_inputs( - template_name, output_dir, 1, {1: lhs, 2: rhs} - ) + test_name = output_dir / f"test_from_template_matmul_truncf.mlir" + template_name = matmul_template_dir / "matmul_truncf_MxK_KxN.mlir" + generate_matmul_test(test_name, template_name, 8, 8, 8, "bf16", "f32") + identity_mat = np.eye(8, dtype=np.float32) + ones = np.ones(8 * 8, dtype=np.float32).reshape([8, 8]) + lhs = ones * 101 + rhs = identity_mat * 3 + input_args = generate_inputs(test_name, output_dir, 1, {1: lhs, 2: rhs}) aie_vs_baseline( config, - template_name, + test_name, input_args, - lhs * 2, # exected output + ones * 302, # expected output use_ukernel=False, tile_pipeline="pack-peel", lower_to_aie_pipeline="objectFifo", @@ -780,7 +780,7 @@ def run(self, config): rtol=0, atol=0, n_repeats=1, - output_type=get_output_type(template_name), + output_type=get_output_type(test_name), ) diff --git a/build_tools/ci/cpu_comparison/test_files/matmul_truncf_32x32x32_bf16_bf16.mlir b/build_tools/ci/cpu_comparison/test_files/matmul_truncf_32x32x32_bf16_bf16.mlir deleted file mode 100644 index 39a58df13..000000000 --- a/build_tools/ci/cpu_comparison/test_files/matmul_truncf_32x32x32_bf16_bf16.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// These lines are required for e2e numerical testing: -// input 32x32xbf16 -// input 32x32xbf16 -// output 32x32xbf16 - -func.func @matmul_truncf(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>) -> 
tensor<32x32xbf16> -{ - %cst = arith.constant 0.0 : f32 - %0 = tensor.empty() : tensor<32x32xf32> - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<32x32xf32>) -> tensor<32x32xf32> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<32x32xbf16>, tensor<32x32xbf16>) - outs(%1: tensor<32x32xf32>) -> tensor<32x32xf32> - %3 = arith.truncf %2 : tensor<32x32xf32> to tensor<32x32xbf16> - return %3: tensor<32x32xbf16> -} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp index bca2a4e71..e5a797650 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp @@ -37,8 +37,9 @@ namespace { /// TODO(newling) improve this design. static bool isCoreComputeOp(Operation *op) { return isa(op); + memref::ExtractStridedMetadataOp, func::CallOp, arith::ExtFOp, + arith::TruncFOp, vector::TransferReadOp, vector::TransferWriteOp>( + op); } /// Utility to map the parallel mapping attributes to the corresponding