From dc19b296cf580500c212f4a7ac2c7a26de8b2382 Mon Sep 17 00:00:00 2001
From: Jerry Wu
Date: Fri, 25 Aug 2023 18:17:14 +0000
Subject: [PATCH] Fix tile sizes and tests

---
 .../Codegen/LLVMCPU/KernelDispatch.cpp        |   8 +-
 .../LLVMCPU/test/test_config_mmt4d.mlir       | 126 +++++++-----
 2 files changed, 48 insertions(+), 86 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 4ac7d19ec9c2..6c33ad995634 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1154,8 +1154,8 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
   auto getDistTileSizes = [&]() -> SmallVector<int64_t> {
     if (!mmt4dDistributionTileSizes.empty()) {
       SmallVector<int64_t> tileSizes;
-      // If mmt4dDistributionTileSizes is set, tile batch dim to 1 + specified
-      // tile sizes.
+      // If mmt4dDistributionTileSizes is set, tile batch dim to 1 + the
+      // specified mmt4d tile sizes.
       tileSizes.push_back(1);
       tileSizes.append(mmt4dDistributionTileSizes.begin(),
                        mmt4dDistributionTileSizes.end());
@@ -1195,9 +1195,7 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
     int M0 = lhsShape[3];
     int N0 = rhsShape[3];
     int K0 = lhsShape[4];
-    tileSizes.push_back(M0);
-    tileSizes.push_back(N0);
-    tileSizes.push_back(K0);
+    tileSizes.append({1, 1, 1, M0, N0, K0});
     return tileSizes;
   };
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
index f8f37a070694..5e1d35931274 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
@@ -46,87 +46,51 @@ hal.executable private @mmt4d_384x384x512_4x1x4_dispatch_0 {
 
 // -----
 
-// #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
-// #pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
-//   #hal.descriptor_set.layout<0, bindings = [
-//     #hal.descriptor_set.binding<0, storage_buffer>,
-//     #hal.descriptor_set.binding<1, storage_buffer>
-//   ]>
-// ]>
-// hal.executable private @batch_matmul {
-//   hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
-//     hal.executable.export public @batch_matmul ordinal(0) layout(#pipeline_layout)
-//     builtin.module {
-//       func.func @batch_matmul() {
-//         %c32_i64 = arith.constant 32 : i64
-//         %cst = arith.constant 0.000000e+00 : f32
-//         %c0 = arith.constant 0 : index
-//         %0 = hal.interface.constant.load[0] : i32
-//         %1 = hal.interface.constant.load[1] : i32
-//         %2 = hal.interface.constant.load[2] : i32
-//         %3 = hal.interface.constant.load[3] : i32
-//         %4 = arith.extui %0 : i32 to i64
-//         %5 = arith.extui %1 : i32 to i64
-//         %6 = arith.shli %5, %c32_i64 : i64
-//         %7 = arith.ori %4, %6 : i64
-//         %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
-//         %9 = arith.extui %2 : i32 to i64
-//         %10 = arith.extui %3 : i32 to i64
-//         %11 = arith.shli %10, %c32_i64 : i64
-//         %12 = arith.ori %9, %11 : i64
-//         %13 = arith.index_castui %12 : i64 to index
-//         %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
-//         %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
-//         %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-//         %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
-//         %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
-//         %19 = tensor.empty() : tensor<128x10x80x8x4xf32>
-//         %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-//         %21 = linalg.batch_mmt4d {__internal_linalg_transform__ = "workgroup"} ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-//         flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-//         return
-//       }
-//     }
-//   }
-// }
-
-hal.executable private @batch_matmul_dispatch_2_batch_matmul_DxDxDxD_f32 {
-hal.executable.variant public @embedded_elf_x86_64, target = <"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}> {
-  hal.executable.export public @batch_matmul_dispatch_2_batch_matmul_DxDxDxD_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>]>]>) {
-  ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: index):
-    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12
-    hal.return %x, %y, %z : index, index, index
-  }
-  builtin.module {
-    func.func @batch_matmul_dispatch_2_batch_matmul_DxDxDxD_f32() {
-      %c32_i64 = arith.constant 32 : i64
-      %cst = arith.constant 0.000000e+00 : f32
-      %c0 = arith.constant 0 : index
-      %0 = hal.interface.constant.load[0] : i32
-      %1 = hal.interface.constant.load[1] : i32
-      %2 = hal.interface.constant.load[2] : i32
-      %3 = hal.interface.constant.load[3] : i32
-      %4 = arith.extui %0 : i32 to i64
-      %5 = arith.extui %1 : i32 to i64
-      %6 = arith.shli %5, %c32_i64 : i64
-      %7 = arith.ori %4, %6 : i64
-      %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
-      %9 = arith.extui %2 : i32 to i64
-      %10 = arith.extui %3 : i32 to i64
-      %11 = arith.shli %10, %c32_i64 : i64
-      %12 = arith.ori %9, %11 : i64
-      %13 = arith.index_castui %12 : i64 to index
-      %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
-      %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
-      %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-      %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
-      %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
-      %19 = tensor.empty() : tensor<128x10x80x8x4xf32>
-      %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-      %21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-      flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-      return
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+hal.executable private @batch_mmt4d {
+  hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+    hal.executable.export public @batch_mmt4d ordinal(0) layout(#pipeline_layout)
+    builtin.module {
+      func.func @batch_mmt4d() {
+        %c32_i64 = arith.constant 32 : i64
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.constant.load[0] : i32
+        %1 = hal.interface.constant.load[1] : i32
+        %2 = hal.interface.constant.load[2] : i32
+        %3 = hal.interface.constant.load[3] : i32
+        %4 = arith.extui %0 : i32 to i64
+        %5 = arith.extui %1 : i32 to i64
+        %6 = arith.shli %5, %c32_i64 : i64
+        %7 = arith.ori %4, %6 : i64
+        %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
+        %9 = arith.extui %2 : i32 to i64
+        %10 = arith.extui %3 : i32 to i64
+        %11 = arith.shli %10, %c32_i64 : i64
+        %12 = arith.ori %9, %11 : i64
+        %13 = arith.index_castui %12 : i64 to index
+        %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
+        %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
+        %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
+        %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
+        %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
+        %19 = tensor.empty() : tensor<128x10x80x8x4xf32>
+        %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+        %21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+        flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
+        return
+      }
     }
   }
 }
-}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
+// CHECK: func.func @batch_mmt4d()
+// CHECK: linalg.batch_mmt4d
+// CHECK-SAME: lowering_config = #[[CONFIG]]
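The substance of the fix is the second KernelDispatch.cpp hunk: linalg.batch_mmt4d iterates over seven loops (batch, M1, N1, K1, M0, N0, K0), so the vector-level tile-size list needs one entry per loop, with the already-distributed outer loops tiled to 1 and the inner tile shape M0 x N0 x K0 read off the packed operand types; the old code appended only {M0, N0, K0}. The sketch below replays that computation for the shapes in the new test case. It is a minimal standalone illustration, not code from the patch: the seeded batch entry mirrors the distribution-level lambda in the first hunk, but whether the vector-size lambda seeds it the same way is an assumption about code outside the visible hunks.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Packed shapes from the new test case:
  //   lhs: tensor<128x10x32x8x1xf32>  (B x M1 x K1 x M0 x K0)
  //   rhs: tensor<128x80x32x4x1xf32>  (B x N1 x K1 x N0 x K0)
  std::vector<int64_t> lhsShape = {128, 10, 32, 8, 1};
  std::vector<int64_t> rhsShape = {128, 80, 32, 4, 1};

  // Inner tile sizes read off the packed operand types, as in the patch.
  int64_t M0 = lhsShape[3];  // 8
  int64_t N0 = rhsShape[3];  // 4
  int64_t K0 = lhsShape[4];  // 1

  std::vector<int64_t> tileSizes;
  // Assumption: the surrounding (unshown) code seeds the batch dim with 1,
  // as the distribution-level lambda in the first hunk does.
  tileSizes.push_back(1);

  // The fix: one entry per remaining loop of linalg.batch_mmt4d
  // (M1, N1, K1 tiled to 1, then the full inner tile M0, N0, K0).
  // The old code appended only {M0, N0, K0}, leaving the outer
  // loops without entries.
  tileSizes.insert(tileSizes.end(), {1, 1, 1, M0, N0, K0});

  // Prints: 1 1 1 1 8 4 1
  for (int64_t s : tileSizes) std::cout << s << " ";
  std::cout << "\n";
  return 0;
}
```

With the old push_back-only logic the list would have come out as {1, 8, 4, 1}, too short for the op's seven loops, which is consistent with the test update replacing the old batch_matmul snapshot with the cleaner batch_mmt4d case and its new lowering_config CHECK lines.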