From dc19b296cf580500c212f4a7ac2c7a26de8b2382 Mon Sep 17 00:00:00 2001
From: Jerry Wu
Date: Fri, 25 Aug 2023 18:17:14 +0000
Subject: [PATCH] Fix tile sizes and tests

---
 .../Codegen/LLVMCPU/KernelDispatch.cpp        |   8 +-
 .../LLVMCPU/test/test_config_mmt4d.mlir       | 126 +++++++-----
 2 files changed, 48 insertions(+), 86 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 4ac7d19ec9c2..6c33ad995634 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1154,8 +1154,8 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
   auto getDistTileSizes = [&]() -> SmallVector<int64_t> {
     if (!mmt4dDistributionTileSizes.empty()) {
       SmallVector<int64_t> tileSizes;
-      // If mmt4dDistributionTileSizes is set, tile batch dim to 1 + specified
-      // tile sizes.
+      // If mmt4dDistributionTileSizes is set, tile batch dim to 1 + the
+      // specified mmt4d tile sizes.
       tileSizes.push_back(1);
       tileSizes.append(mmt4dDistributionTileSizes.begin(),
                        mmt4dDistributionTileSizes.end());
@@ -1195,9 +1195,7 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
     int M0 = lhsShape[3];
     int N0 = rhsShape[3];
     int K0 = lhsShape[4];
-    tileSizes.push_back(M0);
-    tileSizes.push_back(N0);
-    tileSizes.push_back(K0);
+    tileSizes.append({1, 1, 1, M0, N0, K0});
     return tileSizes;
   };
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
index f8f37a070694..5e1d35931274 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
@@ -46,87 +46,51 @@ hal.executable private @mmt4d_384x384x512_4x1x4_dispatch_0 {
 
 // -----
 
-// #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
-// #pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
-//   #hal.descriptor_set.layout<0, bindings = [
-//     #hal.descriptor_set.binding<0, storage_buffer>,
-//     #hal.descriptor_set.binding<1, storage_buffer>
-//   ]>
-// ]>
-// hal.executable private @batch_matmul {
-//   hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
-//     hal.executable.export public @batch_matmul ordinal(0) layout(#pipeline_layout)
-//     builtin.module {
-//       func.func @batch_matmul() {
-//         %c32_i64 = arith.constant 32 : i64
-//         %cst = arith.constant 0.000000e+00 : f32
-//         %c0 = arith.constant 0 : index
-//         %0 = hal.interface.constant.load[0] : i32
-//         %1 = hal.interface.constant.load[1] : i32
-//         %2 = hal.interface.constant.load[2] : i32
-//         %3 = hal.interface.constant.load[3] : i32
-//         %4 = arith.extui %0 : i32 to i64
-//         %5 = arith.extui %1 : i32 to i64
-//         %6 = arith.shli %5, %c32_i64 : i64
-//         %7 = arith.ori %4, %6 : i64
-//         %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
-//         %9 = arith.extui %2 : i32 to i64
-//         %10 = arith.extui %3 : i32 to i64
-//         %11 = arith.shli %10, %c32_i64 : i64
-//         %12 = arith.ori %9, %11 : i64
-//         %13 = arith.index_castui %12 : i64 to index
-//         %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
-//         %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
-//         %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-//         %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
-//         %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
-//         %19 = tensor.empty() : tensor<128x10x80x8x4xf32>
-//         %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-//         %21 = linalg.batch_mmt4d {__internal_linalg_transform__ = "workgroup"} ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-//         flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-//         return
-//       }
-//     }
-//   }
-// }
-
-hal.executable private @batch_matmul_dispatch_2_batch_matmul_DxDxDxD_f32 {
-hal.executable.variant public @embedded_elf_x86_64, target = <"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}> {
-  hal.executable.export public @batch_matmul_dispatch_2_batch_matmul_DxDxDxD_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>]>]>) {
-  ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: index):
-    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12
-    hal.return %x, %y, %z : index, index, index
-  }
-  builtin.module {
-    func.func @batch_matmul_dispatch_2_batch_matmul_DxDxDxD_f32() {
-      %c32_i64 = arith.constant 32 : i64
-      %cst = arith.constant 0.000000e+00 : f32
-      %c0 = arith.constant 0 : index
-      %0 = hal.interface.constant.load[0] : i32
-      %1 = hal.interface.constant.load[1] : i32
-      %2 = hal.interface.constant.load[2] : i32
-      %3 = hal.interface.constant.load[3] : i32
-      %4 = arith.extui %0 : i32 to i64
-      %5 = arith.extui %1 : i32 to i64
-      %6 = arith.shli %5, %c32_i64 : i64
-      %7 = arith.ori %4, %6 : i64
-      %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
-      %9 = arith.extui %2 : i32 to i64
-      %10 = arith.extui %3 : i32 to i64
-      %11 = arith.shli %10, %c32_i64 : i64
-      %12 = arith.ori %9, %11 : i64
-      %13 = arith.index_castui %12 : i64 to index
-      %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
-      %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
-      %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-      %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
-      %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
-      %19 = tensor.empty() : tensor<128x10x80x8x4xf32>
-      %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-      %21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-      flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-      return
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+hal.executable private @batch_mmt4d {
+  hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+    hal.executable.export public @batch_mmt4d ordinal(0) layout(#pipeline_layout)
+    builtin.module {
+      func.func @batch_mmt4d() {
+        %c32_i64 = arith.constant 32 : i64
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.constant.load[0] : i32
+        %1 = hal.interface.constant.load[1] : i32
+        %2 = hal.interface.constant.load[2] : i32
+        %3 = hal.interface.constant.load[3] : i32
+        %4 = arith.extui %0 : i32 to i64
+        %5 = arith.extui %1 : i32 to i64
+        %6 = arith.shli %5, %c32_i64 : i64
+        %7 = arith.ori %4, %6 : i64
+        %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
+        %9 = arith.extui %2 : i32 to i64
+        %10 = arith.extui %3 : i32 to i64
+        %11 = arith.shli %10, %c32_i64 : i64
+        %12 = arith.ori %9, %11 : i64
+        %13 = arith.index_castui %12 : i64 to index
+        %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
+        %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
+        %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
+        %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
+        %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
+        %19 = tensor.empty() : tensor<128x10x80x8x4xf32>
+        %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+        %21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+        flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
+        return
+      }
     }
   }
 }
-}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
+// CHECK: func.func @batch_mmt4d()
+// CHECK: linalg.batch_mmt4d
+// CHECK-SAME: lowering_config = #[[CONFIG]]
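The substance of the fix is the second KernelDispatch.cpp hunk: linalg.batch_mmt4d iterates over seven loops (batch, M1, N1, K1, M0, N0, K0), so the vector-level tile-size list needs one entry per loop, with the already-distributed outer loops tiled to 1 and the inner tile shape M0 x N0 x K0 read off the packed operand types; the old code appended only {M0, N0, K0}. The sketch below replays that computation for the shapes in the new test case. It is a minimal standalone illustration, not code from the patch: the seeded batch entry mirrors the distribution-level lambda in the first hunk, but whether the vector-size lambda seeds it the same way is an assumption about code outside the visible hunks.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Packed shapes from the new test case:
  //   lhs: tensor<128x10x32x8x1xf32>  (B x M1 x K1 x M0 x K0)
  //   rhs: tensor<128x80x32x4x1xf32>  (B x N1 x K1 x N0 x K0)
  std::vector<int64_t> lhsShape = {128, 10, 32, 8, 1};
  std::vector<int64_t> rhsShape = {128, 80, 32, 4, 1};

  // Inner tile sizes read off the packed operand types, as in the patch.
  int64_t M0 = lhsShape[3];  // 8
  int64_t N0 = rhsShape[3];  // 4
  int64_t K0 = lhsShape[4];  // 1

  std::vector<int64_t> tileSizes;
  // Assumption: the surrounding (unshown) code seeds the batch dim with 1,
  // as the distribution-level lambda in the first hunk does.
  tileSizes.push_back(1);

  // The fix: one entry per remaining loop of linalg.batch_mmt4d
  // (M1, N1, K1 tiled to 1, then the full inner tile M0, N0, K0).
  // The old code appended only {M0, N0, K0}, leaving the outer
  // loops without entries.
  tileSizes.insert(tileSizes.end(), {1, 1, 1, M0, N0, K0});

  // Prints: 1 1 1 1 8 4 1
  for (int64_t s : tileSizes) std::cout << s << " ";
  std::cout << "\n";
  return 0;
}
```

With the old push_back-only logic the list would have come out as {1, 8, 4, 1}, too short for the op's seven loops, which is consistent with the test update replacing the old batch_matmul snapshot with the cleaner batch_mmt4d case and its new lowering_config CHECK lines.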