diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp index 7f60fc06b929..5d6cf1a34b97 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp @@ -776,39 +776,24 @@ static LogicalResult setMatmulPeelingRootConfig( func::FuncOp entryPointFn, linalg::ContractionOpInterface op, ArrayRef distTileSizes, ArrayRef cacheTileSizes, ArrayRef vecTileSizes, int vectorSize) { - // The tiling for parallel dims and reduction dims should be separated. - SmallVector parallelTileSizes(vecTileSizes.begin(), - vecTileSizes.end()); - parallelTileSizes.back() = 0; - - // Clamp inner tiling sizes to avoid masking. The vector masking takes the - // last level of tiling to create masks. It would lead to incorrect masking if - // the inner tiling sizes are not clamped. Because padding won't be applied - // along those dimensions. - for (const auto &[index, size] : llvm::enumerate(distTileSizes)) { - if (!size) - continue; - parallelTileSizes[index] = std::min(parallelTileSizes[index], size); - } - - // TODO(hanchung): Make logic more heuristic. Peeling hurts performance a lot - // if the dim size is small (e.g., K=24). + // The tiling for parallel dims (M and N) and reduction dim (K) should be + // separated, so we move K dim from parallel tile sizes to reduction tile + // sizes. int64_t numTilingDims = vecTileSizes.size(); - SmallVector reductionTileSizes(numTilingDims - 1, 0); - auto lhsShapedType = llvm::cast(op.lhs().getType()); - int64_t K = lhsShapedType.getShape().back(); - reductionTileSizes.push_back( - getMaxVectorTileSize(K, vecTileSizes.back(), vectorSize)); - SmallVector cacheParallelTileSizes(cacheTileSizes.begin(), cacheTileSizes.end()); SmallVector cacheReductionTileSizes(numTilingDims, 0); std::swap(cacheParallelTileSizes.back(), cacheReductionTileSizes.back()); + SmallVector vectorParallelTileSizes(vecTileSizes.begin(), + vecTileSizes.end()); + SmallVector vectorReductionTileSizes(numTilingDims, 0); + std::swap(vectorParallelTileSizes.back(), vectorReductionTileSizes.back()); + TileSizesListType tileSizes = { SmallVector(distTileSizes), cacheParallelTileSizes, - cacheReductionTileSizes, parallelTileSizes, reductionTileSizes}; - + cacheReductionTileSizes, vectorParallelTileSizes, + vectorReductionTileSizes}; // No need for tiling inner parallel dims. tileSizes.emplace_back(numTilingDims, 0); diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir index cac9fb004b6c..3cbe689b7b96 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir @@ -358,15 +358,15 @@ hal.executable private @quant_matmul_fusion { %c-128_i32 = arith.constant -128 : i32 %c127_i32 = arith.constant 127 : i32 %c0_i32 = arith.constant 0 : i32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> %5 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> %6 = hal.interface.binding.subspan set(0) binding(6) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %7 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 24], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2304x24xi8> - %8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 144], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<24x144xi8> + %7 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 49], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2304x49xi8> + %8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [49, 144], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<49x144xi8> %9 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor> -> tensor<144xi32> %10 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor> -> tensor<144xi32> %11 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor> -> tensor<144xi32> @@ -374,7 +374,7 @@ hal.executable private @quant_matmul_fusion { %13 = tensor.empty() : tensor<2304x144xi8> %14 = tensor.empty() : tensor<2304x144xi32> %15 = linalg.fill ins(%c0_i32 : i32) outs(%14 : tensor<2304x144xi32>) -> tensor<2304x144xi32> - %16 = linalg.matmul ins(%7, %8 : tensor<2304x24xi8>, tensor<24x144xi8>) outs(%15 : tensor<2304x144xi32>) -> tensor<2304x144xi32> + %16 = linalg.matmul ins(%7, %8 : tensor<2304x49xi8>, tensor<49x144xi8>) outs(%15 : tensor<2304x144xi32>) -> tensor<2304x144xi32> %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %16, %10, %11, %12 : tensor<144xi32>, tensor<2304x144xi32>, tensor<144xi32>, tensor<144xi32>, tensor<144xi8>) outs(%13 : tensor<2304x144xi8>) { ^bb0(%in: i32, %in_0: i32, %in_1: i32, %in_2: i32, %in_3: i8, %out: i8): %18 = arith.muli %in_1, %c12_i32 : i32 diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir index bc8d24f14dba..58cc56baf4d5 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir @@ -386,15 +386,15 @@ hal.executable private @preset_config_matmul_tensors { #hal.descriptor_set.binding<2, storage_buffer> ]> ]> -hal.executable private @matmul_partially_pad { +hal.executable private @matmul_partially_peel { hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", { data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", target_triple = "x86_64-unknown-linux-gnu", native_vector_size = 16 : index }>) { - hal.executable.export @matmul_partially_pad layout(#pipeline_layout) + hal.executable.export @matmul_partially_peel layout(#pipeline_layout) builtin.module { - func.func @matmul_partially_pad() { + func.func @matmul_partially_peel() { %cst = arith.constant 0.000000e+00 : f32 %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> @@ -418,11 +418,11 @@ hal.executable private @matmul_partially_pad { } } } -// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config +// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info -// CHECK: hal.executable.export public @matmul_partially_pad +// CHECK: hal.executable.export public @matmul_partially_peel // CHECK-SAME: translation_info = #[[TRANSLATION]] -// CHECK: func.func @matmul_partially_pad +// CHECK: func.func @matmul_partially_peel // CHECK: linalg.matmul // CHECK-SAME: lowering_config = #[[CONFIG]] @@ -1984,7 +1984,7 @@ hal.executable private @quant_model { } } -// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config +// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info // CHECK: hal.executable.export public @quant_model // CHECK-SAME: translation_info = #[[TRANSLATION]]