Fix tile sizes and tests
Jerry Wu committed Aug 25, 2023
1 parent 90a3135 commit dc19b29
Showing 2 changed files with 48 additions and 86 deletions.
8 changes: 3 additions & 5 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1154,8 +1154,8 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
auto getDistTileSizes = [&]() -> SmallVector<int64_t> {
  if (!mmt4dDistributionTileSizes.empty()) {
    SmallVector<int64_t> tileSizes;
-    // If mmt4dDistributionTileSizes is set, tile batch dim to 1 + specified
-    // tile sizes.
+    // If mmt4dDistributionTileSizes is set, tile batch dim to 1 + the
+    // specified mmt4d tile sizes.
    tileSizes.push_back(1);
    tileSizes.append(mmt4dDistributionTileSizes.begin(),
                     mmt4dDistributionTileSizes.end());
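For illustration, a minimal sketch of what this branch computes, with std::vector standing in for llvm::SmallVector and a hypothetical helper name (not part of the patch):

#include <cstdint>
#include <vector>

// Sketch: when the distribution-tile-sizes flag is non-empty, the batch
// dimension is tiled to 1 and the user-specified M/N/K sizes follow it,
// e.g. a flag value of {10, 80, 32} yields {1, 10, 80, 32}.
std::vector<int64_t> distTileSizesFromFlag(
    const std::vector<int64_t> &mmt4dDistributionTileSizes) {
  std::vector<int64_t> tileSizes;
  tileSizes.push_back(1);  // batch dim -> 1
  tileSizes.insert(tileSizes.end(), mmt4dDistributionTileSizes.begin(),
                   mmt4dDistributionTileSizes.end());
  return tileSizes;
}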
@@ -1195,9 +1195,7 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
    int M0 = lhsShape[3];
    int N0 = rhsShape[3];
    int K0 = lhsShape[4];
-    tileSizes.push_back(M0);
-    tileSizes.push_back(N0);
-    tileSizes.push_back(K0);
+    tileSizes.append({1, 1, 1, M0, N0, K0});
    return tileSizes;
  };
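Worked through with the shapes from the batch_mmt4d test below (lhs tensor<128x10x32x8x1xf32>, rhs tensor<128x80x32x4x1xf32>): M0 = lhsShape[3] = 8, N0 = rhsShape[3] = 4, K0 = lhsShape[4] = 1, so the appended entries are {1, 1, 1, 8, 4, 1}. A sketch under the same assumptions, with standard-library types and a hypothetical function name in place of the in-tree ones:

#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch of the fixed branch: unit tiles for the outer batch/M/N dims,
// then the inner tile sizes read off the packed operand shapes.
std::vector<int64_t> innerTileSizes(const std::array<int64_t, 5> &lhsShape,
                                    const std::array<int64_t, 5> &rhsShape) {
  int64_t M0 = lhsShape[3];
  int64_t N0 = rhsShape[3];
  int64_t K0 = lhsShape[4];
  return {1, 1, 1, M0, N0, K0};
}

int main() {
  // Shapes from the batch_mmt4d test: 128x10x32x8x1 and 128x80x32x4x1.
  std::vector<int64_t> sizes =
      innerTileSizes({128, 10, 32, 8, 1}, {128, 80, 32, 4, 1});
  assert((sizes == std::vector<int64_t>{1, 1, 1, 8, 4, 1}));
  return 0;
}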

126 changes: 45 additions & 81 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
@@ -46,87 +46,51 @@ hal.executable private @mmt4d_384x384x512_4x1x4_dispatch_0 {

// -----

-// #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
-// #pipeline_layout = #hal.pipeline.layout<push_constants = 28, sets = [
-// #hal.descriptor_set.layout<0, bindings = [
-// #hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
-// #hal.descriptor_set.binding<1, storage_buffer>
-// ]>
-// ]>
-// hal.executable private @batch_matmul {
-// hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
-// hal.executable.export public @batch_matmul ordinal(0) layout(#pipeline_layout)
-// builtin.module {
-// func.func @batch_matmul() {
-// %c32_i64 = arith.constant 32 : i64
-// %cst = arith.constant 0.000000e+00 : f32
-// %c0 = arith.constant 0 : index
-// %0 = hal.interface.constant.load[0] : i32
-// %1 = hal.interface.constant.load[1] : i32
-// %2 = hal.interface.constant.load[2] : i32
-// %3 = hal.interface.constant.load[3] : i32
-// %4 = arith.extui %0 : i32 to i64
-// %5 = arith.extui %1 : i32 to i64
-// %6 = arith.shli %5, %c32_i64 : i64
-// %7 = arith.ori %4, %6 : i64
-// %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
-// %9 = arith.extui %2 : i32 to i64
-// %10 = arith.extui %3 : i32 to i64
-// %11 = arith.shli %10, %c32_i64 : i64
-// %12 = arith.ori %9, %11 : i64
-// %13 = arith.index_castui %12 : i64 to index
-// %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
-// %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
-// %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-// %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
-// %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
-// %19 = tensor.empty() : tensor<128x10x80x8x4xf32>
-// %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-// %21 = linalg.batch_mmt4d {__internal_linalg_transform__ = "workgroup"} ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-// flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-// return
-// }
-// }
-// }
-// }
-
-hal.executable private @batch_matmul_dispatch_2_batch_matmul_DxDxDxD_f32 {
-hal.executable.variant public @embedded_elf_x86_64, target = <"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}> {
-hal.executable.export public @batch_matmul_dispatch_2_batch_matmul_DxDxDxD_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 28, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
-^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: index):
-%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12
-hal.return %x, %y, %z : index, index, index
-}
-builtin.module {
-func.func @batch_matmul_dispatch_2_batch_matmul_DxDxDxD_f32() {
-%c32_i64 = arith.constant 32 : i64
-%cst = arith.constant 0.000000e+00 : f32
-%c0 = arith.constant 0 : index
-%0 = hal.interface.constant.load[0] : i32
-%1 = hal.interface.constant.load[1] : i32
-%2 = hal.interface.constant.load[2] : i32
-%3 = hal.interface.constant.load[3] : i32
-%4 = arith.extui %0 : i32 to i64
-%5 = arith.extui %1 : i32 to i64
-%6 = arith.shli %5, %c32_i64 : i64
-%7 = arith.ori %4, %6 : i64
-%8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
-%9 = arith.extui %2 : i32 to i64
-%10 = arith.extui %3 : i32 to i64
-%11 = arith.shli %10, %c32_i64 : i64
-%12 = arith.ori %9, %11 : i64
-%13 = arith.index_castui %12 : i64 to index
-%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
-%15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
-%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-%17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
-%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
-%19 = tensor.empty() : tensor<128x10x80x8x4xf32>
-%20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-%21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
-flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
-return
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 28, sets = [
+#hal.descriptor_set.layout<0, bindings = [
+#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
+#hal.descriptor_set.binding<1, storage_buffer>
+]>
+]>
+hal.executable private @batch_mmt4d {
+hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+hal.executable.export public @batch_mmt4d ordinal(0) layout(#pipeline_layout)
+builtin.module {
+func.func @batch_mmt4d() {
+%c32_i64 = arith.constant 32 : i64
+%cst = arith.constant 0.000000e+00 : f32
+%c0 = arith.constant 0 : index
+%0 = hal.interface.constant.load[0] : i32
+%1 = hal.interface.constant.load[1] : i32
+%2 = hal.interface.constant.load[2] : i32
+%3 = hal.interface.constant.load[3] : i32
+%4 = arith.extui %0 : i32 to i64
+%5 = arith.extui %1 : i32 to i64
+%6 = arith.shli %5, %c32_i64 : i64
+%7 = arith.ori %4, %6 : i64
+%8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
+%9 = arith.extui %2 : i32 to i64
+%10 = arith.extui %3 : i32 to i64
+%11 = arith.shli %10, %c32_i64 : i64
+%12 = arith.ori %9, %11 : i64
+%13 = arith.index_castui %12 : i64 to index
+%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
+%15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
+%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
+%17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
+%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
+%19 = tensor.empty() : tensor<128x10x80x8x4xf32>
+%20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+%21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
+return
+}
+}
+}
+}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 20, 0, 0, 0, 0], [1, 1, 1, 0, 8, 4, 0], [0, 0, 0, 1, 0, 0, 1]{{\]}}>
// CHECK: func.func @batch_mmt4d()
// CHECK: linalg.batch_mmt4d
// CHECK-SAME: lowering_config = #[[CONFIG]]
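Reading the expected config against the C++ change: each tile-size list has seven entries, one per batch_mmt4d loop in the order (B, M, N, K, M0, N0, K0). The distribution level [0, 0, 20, 0, 0, 0, 0] tiles only the N loop; the vector-parallel level [1, 1, 1, 0, 8, 4, 0] carries the unit batch/M/N tiles plus M0 = 8 and N0 = 4; and the reduction level [0, 0, 0, 1, 0, 0, 1] tiles K and K0 to 1. Together these reflect the {1, 1, 1, M0, N0, K0} = {1, 1, 1, 8, 4, 1} entries built in KernelDispatch.cpp for this test's 8x1 and 4x1 inner tiles.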
