Skip to content

Commit

Permalink
[CPU] Remove legacy logics from matmul peeling expert. (iree-org#16041)
Browse files Browse the repository at this point in the history
  • Loading branch information
hanhanW authored Jan 15, 2024
1 parent 863d302 commit f47b76f
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 37 deletions.
35 changes: 10 additions & 25 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -776,39 +776,24 @@ static LogicalResult setMatmulPeelingRootConfig(
func::FuncOp entryPointFn, linalg::ContractionOpInterface op,
ArrayRef<int64_t> distTileSizes, ArrayRef<int64_t> cacheTileSizes,
ArrayRef<int64_t> vecTileSizes, int vectorSize) {
// The tiling for parallel dims and reduction dims should be separated.
SmallVector<int64_t> parallelTileSizes(vecTileSizes.begin(),
vecTileSizes.end());
parallelTileSizes.back() = 0;

// Clamp inner tiling sizes to avoid masking. The vector masking takes the
// last level of tiling to create masks, so leaving the inner tiling sizes
// unclamped would lead to incorrect masking, because padding won't be applied
// along those dimensions.
for (const auto &[index, size] : llvm::enumerate(distTileSizes)) {
if (!size)
continue;
parallelTileSizes[index] = std::min(parallelTileSizes[index], size);
}

// TODO(hanchung): Make this logic more heuristic-driven. Peeling hurts
// performance a lot if the dim size is small (e.g., K=24).
// The tiling for parallel dims (M and N) and reduction dim (K) should be
// separated, so we move K dim from parallel tile sizes to reduction tile
// sizes.
int64_t numTilingDims = vecTileSizes.size();
SmallVector<int64_t> reductionTileSizes(numTilingDims - 1, 0);
auto lhsShapedType = llvm::cast<ShapedType>(op.lhs().getType());
int64_t K = lhsShapedType.getShape().back();
reductionTileSizes.push_back(
getMaxVectorTileSize(K, vecTileSizes.back(), vectorSize));

SmallVector<int64_t> cacheParallelTileSizes(cacheTileSizes.begin(),
cacheTileSizes.end());
SmallVector<int64_t> cacheReductionTileSizes(numTilingDims, 0);
std::swap(cacheParallelTileSizes.back(), cacheReductionTileSizes.back());

SmallVector<int64_t> vectorParallelTileSizes(vecTileSizes.begin(),
vecTileSizes.end());
SmallVector<int64_t> vectorReductionTileSizes(numTilingDims, 0);
std::swap(vectorParallelTileSizes.back(), vectorReductionTileSizes.back());

TileSizesListType tileSizes = {
SmallVector<int64_t>(distTileSizes), cacheParallelTileSizes,
cacheReductionTileSizes, parallelTileSizes, reductionTileSizes};

cacheReductionTileSizes, vectorParallelTileSizes,
vectorReductionTileSizes};
// No need for tiling inner parallel dims.
tileSizes.emplace_back(numTilingDims, 0);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -358,23 +358,23 @@ hal.executable private @quant_matmul_fusion {
%c-128_i32 = arith.constant -128 : i32
%c127_i32 = arith.constant 127 : i32
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<24x144xi8>>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2304x49xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<49x144xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
%4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
%5 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi8>>
%6 = hal.interface.binding.subspan set(0) binding(6) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
%7 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>> -> tensor<2304x24xi8>
%8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<24x144xi8>> -> tensor<24x144xi8>
%7 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2304x49xi8>> -> tensor<2304x49xi8>
%8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [49, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<49x144xi8>> -> tensor<49x144xi8>
%9 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
%10 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
%11 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
%12 = flow.dispatch.tensor.load %5, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi8>> -> tensor<144xi8>
%13 = tensor.empty() : tensor<2304x144xi8>
%14 = tensor.empty() : tensor<2304x144xi32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%14 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
%16 = linalg.matmul ins(%7, %8 : tensor<2304x24xi8>, tensor<24x144xi8>) outs(%15 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
%16 = linalg.matmul ins(%7, %8 : tensor<2304x49xi8>, tensor<49x144xi8>) outs(%15 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %16, %10, %11, %12 : tensor<144xi32>, tensor<2304x144xi32>, tensor<144xi32>, tensor<144xi32>, tensor<144xi8>) outs(%13 : tensor<2304x144xi8>) {
^bb0(%in: i32, %in_0: i32, %in_1: i32, %in_2: i32, %in_3: i8, %out: i8):
%18 = arith.muli %in_1, %c12_i32 : i32
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -386,15 +386,15 @@ hal.executable private @preset_config_matmul_tensors {
#hal.descriptor_set.binding<2, storage_buffer>
]>
]>
hal.executable private @matmul_partially_pad {
hal.executable private @matmul_partially_peel {
hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
target_triple = "x86_64-unknown-linux-gnu",
native_vector_size = 16 : index
}>) {
hal.executable.export @matmul_partially_pad layout(#pipeline_layout)
hal.executable.export @matmul_partially_peel layout(#pipeline_layout)
builtin.module {
func.func @matmul_partially_pad() {
func.func @matmul_partially_peel() {
%cst = arith.constant 0.000000e+00 : f32
%lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<16641x16xf32>>
Expand All @@ -418,11 +418,11 @@ hal.executable private @matmul_partially_pad {
}
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[129, 8, 0], [129, 8, 0], [0, 0, 0], [8, 8, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[129, 8, 0], [129, 8, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @matmul_partially_pad
// CHECK: hal.executable.export public @matmul_partially_peel
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: func.func @matmul_partially_pad
// CHECK: func.func @matmul_partially_peel
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]

Expand Down Expand Up @@ -1984,7 +1984,7 @@ hal.executable private @quant_model {
}
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[192, 144, 0], [192, 144, 0], [0, 0, 0], [8, 32, 0], [0, 0, 12], [0, 0, 0]]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[192, 144, 0], [192, 144, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @quant_model
// CHECK-SAME: translation_info = #[[TRANSLATION]]
Expand Down

0 comments on commit f47b76f

Please sign in to comment.