diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 7f60fc06b929..5d6cf1a34b97 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -776,39 +776,24 @@ static LogicalResult setMatmulPeelingRootConfig(
     func::FuncOp entryPointFn, linalg::ContractionOpInterface op,
     ArrayRef<int64_t> distTileSizes, ArrayRef<int64_t> cacheTileSizes,
     ArrayRef<int64_t> vecTileSizes, int vectorSize) {
-  // The tiling for parallel dims and reduction dims should be separated.
-  SmallVector<int64_t> parallelTileSizes(vecTileSizes.begin(),
-                                         vecTileSizes.end());
-  parallelTileSizes.back() = 0;
-
-  // Clamp inner tiling sizes to avoid masking. The vector masking takes the
-  // last level of tiling to create masks. It would lead to incorrect masking if
-  // the inner tiling sizes are not clamped. Because padding won't be applied
-  // along those dimensions.
-  for (const auto &[index, size] : llvm::enumerate(distTileSizes)) {
-    if (!size)
-      continue;
-    parallelTileSizes[index] = std::min(parallelTileSizes[index], size);
-  }
-
-  // TODO(hanchung): Make logic more heuristic. Peeling hurts performance a lot
-  // if the dim size is small (e.g., K=24).
+  // The tiling for parallel dims (M and N) and the reduction dim (K) should be
+  // separated, so we move the K tile size out of the parallel tile sizes and
+  // into the reduction tile sizes.
   int64_t numTilingDims = vecTileSizes.size();
-  SmallVector<int64_t> reductionTileSizes(numTilingDims - 1, 0);
-  auto lhsShapedType = llvm::cast<ShapedType>(op.lhs().getType());
-  int64_t K = lhsShapedType.getShape().back();
-  reductionTileSizes.push_back(
-      getMaxVectorTileSize(K, vecTileSizes.back(), vectorSize));
-
   SmallVector<int64_t> cacheParallelTileSizes(cacheTileSizes.begin(),
                                               cacheTileSizes.end());
   SmallVector<int64_t> cacheReductionTileSizes(numTilingDims, 0);
   std::swap(cacheParallelTileSizes.back(), cacheReductionTileSizes.back());
 
+  SmallVector<int64_t> vectorParallelTileSizes(vecTileSizes.begin(),
+                                               vecTileSizes.end());
+  SmallVector<int64_t> vectorReductionTileSizes(numTilingDims, 0);
+  std::swap(vectorParallelTileSizes.back(), vectorReductionTileSizes.back());
+
   TileSizesListType tileSizes = {
       SmallVector<int64_t>(distTileSizes), cacheParallelTileSizes,
-      cacheReductionTileSizes, parallelTileSizes, reductionTileSizes};
-
+      cacheReductionTileSizes, vectorParallelTileSizes,
+      vectorReductionTileSizes};
   // No need for tiling inner parallel dims.
   tileSizes.emplace_back(numTilingDims, 0);
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index cac9fb004b6c..3cbe689b7b96 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -358,15 +358,15 @@ hal.executable private @quant_matmul_fusion {
         %c-128_i32 = arith.constant -128 : i32
         %c127_i32 = arith.constant 127 : i32
         %c0_i32 = arith.constant 0 : i32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<24x144xi8>>
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2304x49xi8>>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<49x144xi8>>
         %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
         %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
         %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
         %5 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi8>>
         %6 = hal.interface.binding.subspan set(0) binding(6) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
-        %7 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>> -> tensor<2304x24xi8>
-        %8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<24x144xi8>> -> tensor<24x144xi8>
+        %7 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2304x49xi8>> -> tensor<2304x49xi8>
+        %8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [49, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<49x144xi8>> -> tensor<49x144xi8>
         %9 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
         %10 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
         %11 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
@@ -374,7 +374,7 @@ hal.executable private @quant_matmul_fusion {
         %13 = tensor.empty() : tensor<2304x144xi8>
         %14 = tensor.empty() : tensor<2304x144xi32>
         %15 = linalg.fill ins(%c0_i32 : i32) outs(%14 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
-        %16 = linalg.matmul ins(%7, %8 : tensor<2304x24xi8>, tensor<24x144xi8>) outs(%15 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
+        %16 = linalg.matmul ins(%7, %8 : tensor<2304x49xi8>, tensor<49x144xi8>) outs(%15 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
         %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %16, %10, %11, %12 : tensor<144xi32>, tensor<2304x144xi32>, tensor<144xi32>, tensor<144xi32>, tensor<144xi8>) outs(%13 : tensor<2304x144xi8>) {
         ^bb0(%in: i32, %in_0: i32, %in_1: i32, %in_2: i32, %in_3: i8, %out: i8):
           %18 = arith.muli %in_1, %c12_i32 : i32
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index bc8d24f14dba..58cc56baf4d5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -386,15 +386,15 @@ hal.executable private @preset_config_matmul_tensors  {
     #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
-hal.executable private @matmul_partially_pad  {
+hal.executable private @matmul_partially_peel  {
   hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
     data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
     target_triple = "x86_64-unknown-linux-gnu",
     native_vector_size = 16 : index
   }>) {
-    hal.executable.export @matmul_partially_pad layout(#pipeline_layout)
+    hal.executable.export @matmul_partially_peel layout(#pipeline_layout)
     builtin.module {
-      func.func @matmul_partially_pad() {
+      func.func @matmul_partially_peel() {
         %cst = arith.constant 0.000000e+00 : f32
         %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
             : !flow.dispatch.tensor<readonly:tensor<16641x16xf32>>
@@ -418,11 +418,11 @@ hal.executable private @matmul_partially_pad  {
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[129, 8, 0], [129, 8, 0], [0, 0, 0], [8, 8, 0], [0, 0, 16], [0, 0, 0]]>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[129, 8, 0], [129, 8, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
 //  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-//      CHECK: hal.executable.export public @matmul_partially_pad
+//      CHECK: hal.executable.export public @matmul_partially_peel
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
-//      CHECK: func.func @matmul_partially_pad
+//      CHECK: func.func @matmul_partially_peel
 //      CHECK:   linalg.matmul
 // CHECK-SAME:       lowering_config = #[[CONFIG]]
 
@@ -1984,7 +1984,7 @@ hal.executable private @quant_model {
   }
 }
 
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[192, 144, 0], [192, 144, 0], [0, 0, 0], [8, 32, 0], [0, 0, 12], [0, 0, 0]]>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[192, 144, 0], [192, 144, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
 //  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
 //      CHECK: hal.executable.export public @quant_model
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]