[Vectorization][ObjectFifo] Enable larger Matmul + Truncf #856

Open · wants to merge 23 commits into base: main

Commits (23)
17f6bcc
[Insert-Loops-For-Vec] Update insert-loops-for-vectorization pass
Abhishek-Varma Oct 22, 2024
d53d30e
Disable AIR tests
Abhishek-Varma Oct 28, 2024
ab1c09c
[WIP] Enable vectorization after bufferization (insert-cores)
Abhishek-Varma Oct 14, 2024
5a46ed2
Fix conv pipeline failure + temporarily remove insert_loops test
Abhishek-Varma Oct 14, 2024
1a02a64
Reenable insert-loops test
Abhishek-Varma Oct 14, 2024
248465a
Fix more AIR e2e
Abhishek-Varma Oct 15, 2024
4f6698f
Add lit test for subview->vector.transfer_read/transfer_write
Abhishek-Varma Oct 23, 2024
95c937f
[WIP] Create a function outlining pass
Abhishek-Varma Oct 9, 2024
95143b9
Subsume the compute op and form the function body
Abhishek-Varma Oct 29, 2024
b98d502
After insert-cores fix
Abhishek-Varma Oct 15, 2024
a036c81
Update lower-to-aie for outlined function
Abhishek-Varma Oct 16, 2024
f9a2f6b
Add lit test
Abhishek-Varma Oct 23, 2024
f581744
Add e2e test for larger Matmul + truncf
Abhishek-Varma Oct 23, 2024
9a56fa7
Disable Outlining for Conv
Abhishek-Varma Oct 28, 2024
a342fdc
Fix canonicalization for trivial access read/write subview->vec
Abhishek-Varma Oct 28, 2024
f0bcd45
Fix e2e test
Abhishek-Varma Oct 28, 2024
ee1e49b
Disable few more AIR related tests from run_matmul_test.sh
Abhishek-Varma Oct 28, 2024
cf1d483
Nit change
Abhishek-Varma Oct 28, 2024
89fc4ca
Review comments for Canonicalization pattern
Abhishek-Varma Oct 29, 2024
5397238
Review comments for FuncOutlining
Abhishek-Varma Oct 29, 2024
1e7f4fe
Review comments InsertLoops
Abhishek-Varma Oct 29, 2024
3a2a7a7
Review comment of disabling AIR test
Abhishek-Varma Oct 29, 2024
c6de3ea
Add e2e lit test of Matmul + Truncf
Abhishek-Varma Oct 29, 2024
128 changes: 78 additions & 50 deletions build_tools/ci/cpu_comparison/run.py
@@ -526,7 +526,7 @@ def aie_vs_llvm_cpu(
test_file,
use_ukernel=False,
tile_pipeline="pad-pack",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
function_name=None,
seed=1,
rtol=1e-6,
@@ -675,58 +675,61 @@ def run(self, config):
test_files_dir = config.file_dir / "test_files"
output_dir = config.output_dir

# AIR tests are currently disabled due to the change of infra to:
# bufferization -> insert-cores -> vectorization (@erwei-xilinx)
# Test(s) of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN
test_name = output_dir / "test_from_template_full_bias.mlir"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir"
generate_matmul_test(test_name, template_name, 128, 128, 256, "i32", "i32")
aie_vs_llvm_cpu(
config,
test_name,
tile_pipeline="pack-peel",
lower_to_aie_pipeline="air",
rtol=0,
atol=0,
)

if config.xdna_datetime and config.xdna_datetime < 20240801:
for name in [
"two_matmul_switching",
"matmul_f32_8_8_4",
"matmul_f32_8_4_8",
]:
aie_vs_llvm_cpu(config, test_files_dir / f"{name}.mlir")

aie_vs_llvm_cpu(
config,
test_files_dir / "three_matmuls.mlir",
function_name="three_$mm$",
)
# test_name = output_dir / "test_from_template_full_bias.mlir"
# template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir"
# generate_matmul_test(test_name, template_name, 128, 128, 256, "i32", "i32")
# aie_vs_llvm_cpu(
# config,
# test_name,
# tile_pipeline="pack-peel",
# lower_to_aie_pipeline="air",
# rtol=0,
# atol=0,
# )

# if config.xdna_datetime and config.xdna_datetime < 20240801:
# for name in [
# "two_matmul_switching",
# "matmul_f32_8_8_4",
# "matmul_f32_8_4_8",
# ]:
# aie_vs_llvm_cpu(config, test_files_dir / f"{name}.mlir", lower_to_aie_pipeline="air")

# aie_vs_llvm_cpu(
# config,
# test_files_dir / "three_matmuls.mlir",
# lower_to_aie_pipeline="air",
# function_name="three_$mm$",
# )

# Test(s) of the form matmul(A,B) where A:MxK, B:KxN
test_name = output_dir / "test_from_template.mlir"
template_name = matmul_template_dir / "matmul_MxK_KxN.mlir"
generate_matmul_test(test_name, template_name, 32, 32, 64, "bf16", "f32")
aie_vs_llvm_cpu(config, test_name)
# test_name = output_dir / "test_from_template.mlir"
# template_name = matmul_template_dir / "matmul_MxK_KxN.mlir"
# generate_matmul_test(test_name, template_name, 32, 32, 64, "bf16", "f32")
# aie_vs_llvm_cpu(config, test_name, lower_to_aie_pipeline="air")

# Test(s) of the form matmul(A,B) + C where A:MxK, B:KxN, C:N
test_name = output_dir / "test_from_template_bias_N.mlir"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_N.mlir"
generate_matmul_test(test_name, template_name, 1024, 1024, 512, "bf16", "f32")
if config.vitis_dir:
aie_vs_llvm_cpu(
config,
test_name,
tile_pipeline="pack-peel",
lower_to_aie_pipeline="air",
use_ukernel=True,
)
aie_vs_llvm_cpu(
config,
test_name,
tile_pipeline="pack-peel",
lower_to_aie_pipeline="air",
use_ukernel=False,
)
# test_name = output_dir / "test_from_template_bias_N.mlir"
# template_name = matmul_template_dir / "matmul_bias_MxK_KxN_N.mlir"
# generate_matmul_test(test_name, template_name, 1024, 1024, 512, "bf16", "f32")
# if config.vitis_dir:
# aie_vs_llvm_cpu(
# config,
# test_name,
# tile_pipeline="pack-peel",
# lower_to_aie_pipeline="air",
# use_ukernel=True,
# )
# aie_vs_llvm_cpu(
# config,
# test_name,
# tile_pipeline="pack-peel",
# lower_to_aie_pipeline="air",
# use_ukernel=False,
# )

# Test(s) of the form batch_matmul(A,B) where A:BxMxK, B:BxKxN
template_name = matmul_template_dir / "batch_matmul_BxMxK_BxKxN.mlir"
@@ -783,6 +786,31 @@ def run(self, config):
output_type=get_output_type(test_name),
)

# Large shape Matmul + Truncf
Contributor:
Well, I don't think it's a large shape.

Related question: do even larger shapes work as well? You don't need to add more CI tests here, but I'm just curious what the largest shape is that can work.

Contributor Author:
So, larger shapes like 1024x1024x1024 and 512x2048x4096 (MxNxK) all work.

But shapes like 1536x1536x2048 and 4096x4096x2048 (MxNxK) suffer from a PM issue.
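For reference, a minimal sketch (not part of this PR) of how one of the working shapes could be exercised locally with the helpers already used in run.py. It assumes the surrounding run() scope (config, output_dir, matmul_template_dir), the new objectFifo default pipeline, and a hypothetical output file name; the arguments after the template are M, N, K and the element types, as in the 128x128x256 test below.

test_name = output_dir / "test_matmul_1024x1024x1024.mlir"  # hypothetical name
template_name = matmul_template_dir / "matmul_MxK_KxN.mlir"
generate_matmul_test(test_name, template_name, 1024, 1024, 1024, "bf16", "f32")
# Tolerances may need loosening for bf16 inputs at K = 1024.
aie_vs_llvm_cpu(config, test_name, tile_pipeline="pack-peel")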

Contributor:
Okay, then it may be good to remove the "large shape" wording from the comment above.

generate_matmul_test(test_name, template_name, 128, 128, 256, "bf16", "f32")
identity_mat = np.eye(128, dtype=np.float32)
lhs_ones = np.ones(128 * 256, dtype=np.float32).reshape([128, 256])
rhs_ones = np.ones(256 * 128, dtype=np.float32).reshape([256, 128])
out_ones = np.ones(128 * 128, dtype=np.float32).reshape([128, 128])
lhs = lhs_ones * 2
rhs = rhs_ones * 3
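# Note: lhs is all 2s (128x256) and rhs is all 3s (256x128), so every element
# of matmul(lhs, rhs) is 2 * 3 * 256 = 1536, the baseline used below.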
input_args = generate_inputs(test_name, output_dir, 1, {1: lhs, 2: rhs})
aie_vs_baseline(
config,
test_name,
input_args,
out_ones * 1536, # expected output
use_ukernel=False,
tile_pipeline="pack-peel",
lower_to_aie_pipeline="objectFifo",
function_name=None,
seed=1,
rtol=0,
atol=0,
n_repeats=1,
output_type=get_output_type(test_name),
)


class SmokeSet(TestSet):
def __init__(self):
@@ -793,8 +821,8 @@ def run(self, config):
output_dir = config.output_dir

# The most basic test, direct from .mlir file using all defaults
test_files_dir = file_dir / "test_files"
aie_vs_llvm_cpu(config, test_files_dir / "matmul_int32.mlir")
# test_files_dir = file_dir / "test_files"
# aie_vs_llvm_cpu(config, test_files_dir / "matmul_int32.mlir")
Collaborator:
Maybe make objectFifo the default?

Contributor Author:
I've made objectFifo the default and changed these tests to use the air pipeline explicitly, albeit commented out.


# Using objectFifo pipeline
test_name = output_dir / "test_from_template.mlir"
30 changes: 16 additions & 14 deletions build_tools/ci/run_matmul_test.sh
@@ -593,20 +593,22 @@ run_matmul_test \
--acc_type "i32" \
--m "64" --n "64" --k "128"

run_matmul_test \
--name_prefix "packPeel_bf16" \
--tile_pipeline "pack-peel" \
--lhs_rhs_type "bf16" \
--acc_type "f32" \
--m "512" --n "512" --k "512"

run_matmul_test \
--name_prefix "packPeel_t_bf16" \
--tile_pipeline "pack-peel" \
--lhs_rhs_type "bf16" \
--acc_type "f32" \
--m "128" --n "256" --k "512" \
--do_transpose_rhs "1"
# AIR tests are currently disabled due to the change of infra to:
# bufferization -> insert-cores -> vectorization (@erwei-xilinx)
# run_matmul_test \
Collaborator:
Add comments on why this is disabled.

Contributor:
I think all the AIR tests fail because @Abhishek-Varma changed the order of bufferization and vectorization?

FYI @erwei-xilinx may want to take a look at the changes and see if he can do anything to make the pack-peel pipeline work for AIR.

Collaborator:
Yeah, I am asking to add a comment in the code so that someone reading it a few months from now knows why this was disabled.

# --name_prefix "packPeel_bf16" \
# --tile_pipeline "pack-peel" \
# --lhs_rhs_type "bf16" \
# --acc_type "f32" \
# --m "512" --n "512" --k "512"

# run_matmul_test \
# --name_prefix "packPeel_t_bf16" \
# --tile_pipeline "pack-peel" \
# --lhs_rhs_type "bf16" \
# --acc_type "f32" \
# --m "128" --n "256" --k "512" \
# --do_transpose_rhs "1"

###################################################################
# ObjectFifo Matmul tests
@@ -26,6 +26,7 @@
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
@@ -39,6 +40,92 @@ namespace mlir::iree_compiler::aievec {

using namespace mlir;

/// Utility to check if the indices provided are all 0.
static LogicalResult isAllZeroOffsetAccess(mlir::OperandRange indices) {
if (!llvm::all_of(indices, [](Value val) {
IntegerAttr attr;
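// An index that is not a compile-time constant cannot be proven to be zero.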
if (!matchPattern(val, m_Constant(&attr))) return false;
return attr.getInt() == 0;
})) {
return failure();
}
return success();
}

/// Utility to fetch the indices of a memref.subview op to be used by a new
/// vector transfer_read/transfer_write op with a trivial access pattern.
static SmallVector<Value> fetchNewIndices(PatternRewriter &rewriter,
Location loc,
memref::SubViewOp subViewOp) {
SmallVector<Value> newIndices;
for (OpFoldResult offset : subViewOp.getMixedOffsets()) {
Value indexVal;
if (auto attr = dyn_cast<Attribute>(offset)) {
indexVal = rewriter.create<arith::ConstantIndexOp>(
loc, cast<IntegerAttr>(attr).getInt());
} else {
indexVal = cast<Value>(offset);
}
newIndices.push_back(indexVal);
}
return newIndices;
}

/// A rewrite pattern to canonicalize the following:
/// INPUT:
/// %b = memref.subview %a [offset0, offset1, ...]
/// %c = vector.transfer_read %b[0, 0, ...]
/// OUTPUT:
/// %c = vector.transfer_read %a[offset0, offset1, ...]
///
/// This is needed to enable the other staged canonicalizations in this pass.
struct CanonicalizeTrivialReadAccessSubviewOpPattern
: public OpRewritePattern<vector::TransferReadOp> {
using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
PatternRewriter &rewriter) const override {
auto subViewOp = dyn_cast_if_present<memref::SubViewOp>(
readOp.getSource().getDefiningOp());
if (!subViewOp) return failure();
if (failed(isAllZeroOffsetAccess(readOp.getIndices()))) return failure();
SmallVector<Value> newIndices =
fetchNewIndices(rewriter, readOp.getLoc(), subViewOp);
rewriter.replaceOpWithNewOp<vector::TransferReadOp>(
readOp, readOp.getType(), subViewOp.getSource(), newIndices,
readOp.getPadding(), readOp.getInBoundsValues());
return success();
}
};

/// A rewrite pattern to canonicalize the following:
/// INPUT:
/// %b = memref.subview %a [offset0, offset1, ...]
/// vector.transfer_write %val, %b[0, 0, ...]
/// OUTPUT:
/// vector.transfer_write %val, %a[offset0, offset1, ...]
///
/// This is needed to enable the other staged canonicalizations in this pass.
struct CanonicalizeTrivialWriteAccessSubviewOpPattern
: public OpRewritePattern<vector::TransferWriteOp> {
using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
PatternRewriter &rewriter) const override {
auto subViewOp = dyn_cast_if_present<memref::SubViewOp>(
writeOp.getSource().getDefiningOp());
if (!subViewOp) return failure();
if (failed(isAllZeroOffsetAccess(writeOp.getIndices()))) return failure();
SmallVector<Value> newIndices =
fetchNewIndices(rewriter, writeOp.getLoc(), subViewOp);
rewriter.create<vector::TransferWriteOp>(
writeOp.getLoc(), writeOp.getVector(), subViewOp.getSource(),
newIndices, writeOp.getInBoundsValues());
rewriter.eraseOp(writeOp);
return success();
}
};

static bool isGemmBTransposedContractionOp(vector::ContractionOp op) {
if (op.getKind() != vector::CombiningKind::ADD) return false;

@@ -628,6 +715,12 @@ struct CanonicalizeVectorForAIEVecPass
auto op = getOperation();
MLIRContext *context = &getContext();

{
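// First fold trivial all-zero-index transfer_read/transfer_write ops on a
// memref.subview into accesses on the subview's source; the staged
// canonicalizations below rely on these direct accesses.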
RewritePatternSet patterns(context);
patterns.add<CanonicalizeTrivialReadAccessSubviewOpPattern,
CanonicalizeTrivialWriteAccessSubviewOpPattern>(context);
(void)applyPatternsAndFoldGreedily(op, std::move(patterns));
}
{
// These must run before 'populateVectorBroadcastLoweringPatterns'
// so that broadcasts can be matched before conversion to insert.
@@ -167,3 +167,42 @@ func.func @arith_truncf(%inp: vector<2x3xf32>) -> vector<2x3xbf16> {
%0 = arith.truncf %inp : vector<2x3xf32> to vector<2x3xbf16>
return %0 : vector<2x3xbf16>
}

// -----

// CHECK-LABEL: @trivial_read_access
// CHECK-SAME: (%[[ARG0:.*]]: memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>>)
// CHECK-NOT: memref.subview
// CHECK: %[[COLLAPSE_SHAPE:.*]] = memref.collapse_shape %[[ARG0]]
// CHECK-SAME: into memref<1024xbf16, strided<[1]>>
// CHECK: %[[READ:.*]] = vector.transfer_read %[[COLLAPSE_SHAPE]]
// CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[READ]]
// CHECK-SAME: vector<32xbf16> to vector<1x1x4x8xbf16>
// CHECK: return %[[SHAPE_CAST]]
func.func @trivial_read_access(%arg0: memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>>) -> vector<1x1x4x8xbf16> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%subview = memref.subview %arg0[2, 3, 0, 0] [1, 1, 4, 8] [1, 1, 1, 1] : memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>> to memref<1x1x4x8xbf16, strided<[256, 32, 8, 1], offset: 608>>
%read = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x4x8xbf16, strided<[256, 32, 8, 1], offset: 608>>, vector<1x1x4x8xbf16>
return %read : vector<1x1x4x8xbf16>
}

// -----

// CHECK-LABEL: @trivial_write_access
// CHECK-SAME: (%[[ARG0:.*]]: memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>>,
// CHECK-SAME: %[[ARG1:.*]]: vector<1x1x4x4xf32>)
// CHECK-NOT: memref.subview
// CHECK: %[[COLLAPSE_SHAPE:.*]] = memref.collapse_shape %[[ARG0]]
// CHECK-SAME: : memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>> into memref<1024xf32, strided<[1]>>
// CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[ARG1]]
// CHECK-SAME: : vector<1x1x4x4xf32> to vector<16xf32>
// CHECK: vector.transfer_write %[[SHAPE_CAST]], %[[COLLAPSE_SHAPE]]
// CHECK: return
func.func @trivial_write_access(%arg0: memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>>, %arg1: vector<1x1x4x4xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%subview = memref.subview %arg0[2, 3, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>> to memref<1x1x4x4xf32, strided<[128, 16, 4, 1], offset: 304>>
vector.transfer_write %arg1, %subview[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x4x4xf32>, memref<1x1x4x4xf32, strided<[128, 16, 4, 1], offset: 304>>
return
}
@@ -6,18 +6,32 @@

iree_lit_test_suite(
NAME
lit
lit_objectFifo
SRCS
"conv2d_nhwc_objectfifo_e2e.mlir"
"matmul_elementwise_pack_peel_air_e2e.mlir"
"matmul_pack_peel_air_e2e.mlir"
"matmul_pack_peel_objectfifo.mlir"
"matmul_pack_peel_objectfifo_e2e.mlir"
"matmul_pad_pack_air_e2e.mlir"
"matmul_elementwise_pack_peel_objectfifo_e2e.mlir"
"xdna_oplib_plugin.mlir"
TOOLS
${IREE_LLD_TARGET}
FileCheck
iree-opt
iree-compile
)

# AIR tests are currently disabled due to the change of infra to:
# bufferization -> insert-cores -> vectorization (@erwei-xilinx)
# iree_lit_test_suite(
# NAME
# lit_air
# SRCS
# "matmul_elementwise_pack_peel_air_e2e.mlir"
# "matmul_pack_peel_air_e2e.mlir"
# "matmul_pad_pack_air_e2e.mlir"
# TOOLS
# ${IREE_LLD_TARGET}
# FileCheck
# iree-opt
# iree-compile
# )