diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index fc7859be5a0f..11cf13cc35c2 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -62,7 +62,6 @@ /compiler/src/iree/compiler/Codegen/LLVMCPU/ @hanhanW @MaheshRavishankar /compiler/src/iree/compiler/Codegen/LLVMGPU/ @MaheshRavishankar @qedawkins @kuhar @Groverkss /compiler/src/iree/compiler/Codegen/SPIRV/ @antiagainst @MaheshRavishankar @kuhar -/compiler/src/iree/compiler/Codegen/TransformStrategies/ @qedawkins @MaheshRavishankar /compiler/src/iree/compiler/ConstEval/ @hanhanW @stellaraccident /compiler/src/iree/compiler/Dialect/Encoding/ @bjacob @hanhanW /compiler/src/iree/compiler/Dialect/Flow/ @hanhanW @MaheshRavishankar @IanWood1 diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel index 616e39394d4f..7aca986d540b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel @@ -307,8 +307,6 @@ iree_compiler_cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:Support", "@llvm-project//mlir:DialectUtils", - # TransformStrategies - "//compiler/src/iree/compiler/Codegen/TransformStrategies/Common:TransformStrategies", # TransformExtensions (needed for registration in the pass) "//llvm-external-projects/iree-dialects:IREEDialectsTransforms", "//compiler/src/iree/compiler/Codegen/Common/TransformExtensions:CommonExtensions", diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt index 648805b515ee..764bc258c902 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt @@ -283,7 +283,6 @@ iree_cc_library( iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect iree::compiler::Codegen::LLVMCPU::TransformExtensions::LLVMCPUExtensions iree::compiler::Codegen::LLVMGPU::TransformExtensions::LLVMGPUExtensions - iree::compiler::Codegen::TransformStrategies::Common::TransformStrategies iree::compiler::Dialect::Encoding::IR iree::compiler::Dialect::Flow::IR iree::compiler::Dialect::Flow::TransformExtensions::FlowExtensions diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel index 2a96c4beb54a..e2e15dbbfa68 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel @@ -93,7 +93,6 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Common/CPU:CommonCPUPasses", "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Interfaces:PartitionableLoopsInterface", - "//compiler/src/iree/compiler/Codegen/TransformStrategies/CPU", "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "//compiler/src/iree/compiler/Dialect/Flow/IR", diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt index 8db7e3770149..9eb8dc155d0c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt @@ -151,7 +151,6 @@ iree_cc_library( iree::compiler::Codegen::Common::TransformDialectInterpreterPass iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect iree::compiler::Codegen::Interfaces::PartitionableLoopsInterface - 
iree::compiler::Codegen::TransformStrategies::CPU iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils iree::compiler::Dialect::Flow::IR diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp index c68c905c14d1..6f9983454af5 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp @@ -11,7 +11,6 @@ #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" #include "iree/compiler/Codegen/LLVMCPU/TargetMLTransformInfo.h" #include "iree/compiler/Codegen/LLVMCPU/Utils.h" -#include "iree/compiler/Codegen/TransformStrategies/CPU/Common.h" #include "iree/compiler/Codegen/Utils/CPUUtils.h" #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" #include "iree/compiler/Codegen/Utils/Utils.h" @@ -100,12 +99,6 @@ static llvm::cl::opt clDisableArmSMETiling( "target (i.e., when the +sme feature flag is present)"), llvm::cl::init(false)); -// Non-static options are used in other places. -llvm::cl::opt clEnableTransformDialectJit( - "iree-llvmcpu-enable-transform-dialect-jit", - llvm::cl::desc("enable the usage of the transform dialect JIT"), - llvm::cl::init(false)); - using IREE::Codegen::DispatchLoweringPassPipeline; // Encodes the pre-processing strategy to be applied on a Linalg operation @@ -2007,28 +2000,6 @@ setDefaultGenericOpRootConfig(mlir::FunctionOpInterface entryPointFn, /*subgroupSize=*/{}, pipelineConfig); } -/// Set lowering info to be used by the transform dialect jitter. -static LogicalResult -setTransformStrategyRootConfig(mlir::FunctionOpInterface entryPointFn, - linalg::GenericOp genericOp, - const LinalgOpInfo &linalgOpInfo, - const TargetMLTransformInfo &targetMLTransInfo) { - assert(!getLoweringConfig(genericOp) && - "expected lowering_config is not set"); - if (!clEnableTransformDialectJit) - return failure(); - cpu::CPUModel cpuModel; - if (failed( - cpu::matchAndSetReductionStrategy(entryPointFn, genericOp, cpuModel))) - return failure(); - auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( - entryPointFn->getContext(), - IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen); - if (failed(setTranslationInfo(entryPointFn, translationInfo))) - return failure(); - return success(); -} - /// Utility to return the transpose vector `sizes` for X86. Empty `sizes` on /// return indicates failure. static void getTransposeX86VectorSizes( @@ -2284,11 +2255,6 @@ setRootConfig(mlir::FunctionOpInterface entryPointFn, const TargetMLTransformInfo &targetMLTransInfo) { assert(!getLoweringConfig(genericOp) && "expected lowering_config is not set"); - // First, try to apply the transform dialect strategy, if defined. 
- if (succeeded(setTransformStrategyRootConfig( - entryPointFn, genericOp, linalgOpInfo, targetMLTransInfo))) { - return success(); - } if (succeeded(setTransposeLikeOpRootConfig( entryPointFn, genericOp, linalgOpInfo, targetMLTransInfo))) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp index aeb2b6443a0d..6e64454e0ce3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp @@ -10,16 +10,6 @@ #include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h" #include "iree/compiler/Codegen/LLVMCPU/Passes.h" #include "iree/compiler/Codegen/LLVMCPU/Utils.h" -#include "iree/compiler/Dialect/HAL/IR/HALDialect.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" -#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/PDL/IR/PDL.h" -#include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" @@ -38,22 +28,7 @@ class LLVMCPUSelectLoweringStrategyPass LLVMCPUSelectLoweringStrategyPass> { public: void getDependentDialects(DialectRegistry ®istry) const override { - // TODO(qedawkins): Once TransformStrategies is deprecated, drop the - // unnecessary dialect registrations. - // clang-format off - registry.insert(); - // clang-format on + registry.insert(); } void runOnOperation() override; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel index 3dd48f9e88fb..b074612adbc5 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel @@ -140,7 +140,6 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Interfaces:UKernelOpInterface", "//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions", "//compiler/src/iree/compiler/Codegen/LLVMGPU/Utils", - "//compiler/src/iree/compiler/Codegen/TransformStrategies/GPU", "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt index 8657fc8cf2ce..6a92f60d7f04 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt @@ -183,7 +183,6 @@ iree_cc_library( iree::compiler::Codegen::Interfaces::UKernelOpInterface iree::compiler::Codegen::LLVMGPU::TransformExtensions::LLVMGPUExtensions iree::compiler::Codegen::LLVMGPU::Utils - iree::compiler::Codegen::TransformStrategies::GPU iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils iree::compiler::Codegen::Utils::VectorOpUtils diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 807b9fdb84db..ff002ace5b0f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -18,7 
+18,6 @@ #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" #include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h" #include "iree/compiler/Codegen/LLVMGPU/Passes.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" #include "iree/compiler/Codegen/Utils/Utils.h" @@ -63,11 +62,6 @@ llvm::cl::opt clGPUEnableVectorDistribution( llvm::cl::desc("enable the usage of the vector distribution pipeline"), llvm::cl::init(true)); -llvm::cl::opt clGPUEnableTransformDialectJit( - "iree-codegen-llvmgpu-enable-transform-dialect-jit", - llvm::cl::desc("enable the usage of the transform dialect JIT"), - llvm::cl::init(false)); - /// Flag to force using WMMA tensorcore operations. llvm::cl::opt clGPUUseWMMA("iree-codegen-llvmgpu-use-wmma", @@ -1392,57 +1386,6 @@ static LogicalResult setRootDefaultConfig(IREE::GPU::TargetAttr target, preferredSubgroupSize); } -//====---------------------------------------------------------------------===// -// Transform Dialect Pipeline Configuration -//====---------------------------------------------------------------------===// - -/// Set configuration for transform dialect based strategies. -static LogicalResult -setTransformDialectConfig(IREE::GPU::TargetAttr target, - mlir::FunctionOpInterface entryPoint, Operation *op) { - if (!clGPUEnableTransformDialectJit) { - return failure(); - } - - auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( - entryPoint.getContext(), CodeGenPipeline::TransformDialectCodegen); - - // TODO: unify the target informations into one structure. - iree_compiler::gpu::GPUModel gpuModel; - gpuModel.hasWarpShuffle = target.supportsSubgroupShuffle(); - gpuModel.hasTF32TensorCore = target.supportsTF32InputMMAOps(); - gpuModel.hasMmaSync = target.supportsSyncMMAOps(); - - // Populates a subset of the fragment combinations supported in MLIR lowerings - // to NVVM (which is itself a subset of what LLVM supports) based on what the - // pipeline currently supports. - // TODO: avoid hard coding this and populate based on hardware capabilities. - // TODO: add missing supported configs once the pipeline supports it. - MLIRContext *context = entryPoint.getContext(); - Type f32Type = Float32Type::get(context); - Type f16Type = Float16Type::get(context); - - iree_compiler::gpu::MMAConfig f16f32AccConfig = { - /*m=*/16, /*n=*/16, /*k=*/16, - /*aType=*/f16Type, /*bType=*/f16Type, /*cType=*/f32Type}; - iree_compiler::gpu::MMAConfig f16f16AccConfig = { - /*m=*/16, /*n=*/16, /*k=*/16, - /*aType=*/f16Type, /*bType=*/f16Type, /*cType=*/f16Type}; - gpuModel.supportedWMMAConfigs = {f16f32AccConfig, f16f16AccConfig}; - - if (target.supportsTF32InputMMAOps()) { - iree_compiler::gpu::MMAConfig tf32WmmaConfig = { - /*m=*/16, /*n=*/16, /*k=*/8, - /*aType=*/f32Type, /*bType=*/f32Type, /*cType=*/f32Type}; - gpuModel.supportedWMMAConfigs.push_back(tf32WmmaConfig); - } - - if (failed(iree_compiler::gpu::matchAndSetTransformStrategy(entryPoint, op, - gpuModel))) - return failure(); - return setTranslationInfo(entryPoint, translationInfo); -} - static bool isMatvecLike(linalg::LinalgOp linalgOp) { if (linalgOp.getNumParallelLoops() != 2) return false; @@ -2015,11 +1958,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target, computeOp->print(llvm::dbgs(), OpPrintingFlags().skipRegions()); llvm::dbgs() << "\n"; }); - // First try to see if there is a transform dialect configuration existing. 
- if (succeeded(setTransformDialectConfig(target, entryPointFn, computeOp))) { - LDBG("Transform Dialect Config"); - return success(); - } if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn, computeOp))) { LDBG("Tile and fuse data tiled multi_mma config"); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp index a6d630717bb6..396bbd96e825 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp @@ -8,18 +8,6 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h" #include "iree/compiler/Codegen/LLVMGPU/Passes.h" -#include "iree/compiler/Dialect/HAL/IR/HALDialect.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" -#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" -#include "mlir/Dialect/PDL/IR/PDL.h" -#include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" @@ -41,24 +29,8 @@ class LLVMGPUSelectLoweringStrategyPass final LLVMGPUSelectLoweringStrategyPass>::LLVMGPUSelectLoweringStrategyPassBase; void getDependentDialects(DialectRegistry ®istry) const override { - // TODO(qedawkins): Once TransformStrategies is deprecated, drop the - // unnecessary dialect registrations. 
- // clang-format off registry - .insert(); - // clang-format on + .insert(); } void runOnOperation() override; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 40945f27454c..00bc6f967acf 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -43,13 +43,8 @@ iree_lit_test_suite( "nvvm_mma_sync_pipeline_test.mlir", "reduction_pipeline_cuda.mlir", "reduction_pipeline_rocm.mlir", - "reduction_pipeline_transform_cuda.mlir", - "reduction_pipeline_transform_rocm.mlir", + "reduction_pipeline_softmax_rocm.mlir", "rocdl_pipeline_test.mlir", - "set_transform_strategy_batch_matmul.mlir", - "set_transform_strategy_convolution.mlir", - "set_transform_strategy_matmul.mlir", - "set_transform_strategy_pad.mlir", "illegal_configuration.mlir", "legalize.mlir", "linalg_transform.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index b771513dd764..6be97c06d533 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -49,13 +49,8 @@ iree_lit_test_suite( "promote_matmul_to_fit_mma.mlir" "reduction_pipeline_cuda.mlir" "reduction_pipeline_rocm.mlir" - "reduction_pipeline_transform_cuda.mlir" - "reduction_pipeline_transform_rocm.mlir" + "reduction_pipeline_softmax_rocm.mlir" "rocdl_pipeline_test.mlir" - "set_transform_strategy_batch_matmul.mlir" - "set_transform_strategy_convolution.mlir" - "set_transform_strategy_matmul.mlir" - "set_transform_strategy_pad.mlir" "tensor_pad.mlir" "tensorcore_vectorization.mlir" "transform_dialect_bufferize.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir index 50d989599819..43a7164ac441 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir @@ -1,7 +1,7 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false --iree-gpu-test-target=sm_60 %s | FileCheck %s +// RUN: --iree-gpu-test-target=sm_60 %s | FileCheck %s // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false --iree-gpu-test-target=sm_80 %s | FileCheck %s --check-prefix=SM80 +// RUN: --iree-gpu-test-target=sm_80 %s | FileCheck %s --check-prefix=SM80 // Transform dialect attributes are tested separately. 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir index f292df715093..cfa8875c9685 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir @@ -1,12 +1,10 @@ // RUN: iree-opt %s --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline, func.func(iree-llvmgpu-lower-executable-target))" \ // RUN: --iree-gpu-test-target=sm_60 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ // RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_bufferize_spec.mlir@__transform_main | \ // RUN: FileCheck %s // RUN: iree-opt %s --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline, func.func(iree-llvmgpu-lower-executable-target))" \ // RUN: --iree-gpu-test-target=sm_60 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ // RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir@__transform_main | \ // RUN: FileCheck %s --check-prefix=FOREACH-TO-GPU diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir index cbab841c3f27..e7aaae07cdcb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, iree-codegen-lower-executable-using-transform-dialect, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s +// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s #pipeline_layout = #hal.pipeline.layout, @@ -37,66 +37,27 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { } } -// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4)> -// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info +// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info // CHECK: func.func @warp_reduction_dispatch() // CHECK-SAME: translation_info = #[[TRANSLATION_INFO]] -// CHECK-DAG: %[[C0I:.+]] = arith.constant 0 : i32 -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : i32 -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : i32 -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : i32 -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : i32 -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : i32 -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : i32 -// CHECK-DAG: %[[C32I:.+]] = arith.constant 32 : index -// CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index -// CHECK-DAG: %[[C10240:.+]] = arith.constant 10240 : index -// CHECK-DAG: %[[IDENTITY:.+]] = arith.constant 0.000000e+00 : f32 -// CHECK-DAG: %[[CF:.+]] = arith.constant 1.000000e+00 : f32 -// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1xf32> +// CHECK-DAG: 
%[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<4xf32> // CHECK-DAG: %[[TID:.+]] = gpu.thread_id x -// CHECK: %[[TID4:.+]] = affine.apply #[[$MAP]]()[%[[TID]]] -// CHECK: %[[R0:.+]] = scf.for %{{.*}} = %[[TID4]] to %[[C10240]] step %[[C1024]] iter_args(%[[A0:.+]] = %[[CST]]) -> (vector<1xf32>) { -// CHECK: %[[V:.+]] = vector.transfer_read {{.*}} {in_bounds = [true]} : memref<512x10240xf32, #hal.descriptor_type>, vector<4xf32> -// CHECK: %[[E:.+]] = vector.extract %[[A0]][0] : f32 from vector<1xf32> -// CHECK: %[[RL:.+]] = vector.reduction , %[[V]], %[[E]] : vector<4xf32> into f32 -// CHECK: %[[B:.+]] = vector.broadcast %[[RL:.*]] : f32 to vector<1xf32> -// CHECK: scf.yield %[[B]] : vector<1xf32> +// CHECK: %[[R0:.+]] = scf.for %{{.*}} = %c0 to %c10240 step %c1024 iter_args(%[[A0:.+]] = %[[CST]]) -> (vector<4xf32>) { +// CHECK: %[[V:.+]] = vector.transfer_read {{.*}} : memref<512x10240xf32, #hal.descriptor_type>, vector<4xf32> +// CHECK: %[[ADD:.+]] = arith.addf %[[V]], %[[A0]] : vector<4xf32> +// CHECK: scf.yield %[[ADD]] : vector<4xf32> // CHECK: } -// CHECK: %[[R1:.+]] = vector.extract %[[R0]][0] : f32 from vector<1xf32> -// CHECK: %[[S0:.+]], %{{.*}} = gpu.shuffle xor %[[R1]], %[[C1]], %[[C32]] : f32 -// CHECK: %[[R2:.+]] = arith.addf %[[R1]], %[[S0]] : f32 -// CHECK: %[[S1:.+]], %{{.*}} = gpu.shuffle xor %[[R2]], %[[C2]], %[[C32]] : f32 -// CHECK: %[[R3:.+]] = arith.addf %[[R2]], %[[S1]] : f32 -// CHECK: %[[S2:.+]], %{{.*}} = gpu.shuffle xor %[[R3]], %[[C4]], %[[C32]] : f32 -// CHECK: %[[R4:.+]] = arith.addf %[[R3]], %[[S2]] : f32 -// CHECK: %[[S3:.+]], %{{.*}} = gpu.shuffle xor %[[R4]], %[[C8]], %[[C32]] : f32 -// CHECK: %[[R5:.+]] = arith.addf %[[R4]], %[[S3]] : f32 -// CHECK: %[[S4:.+]], %{{.*}} = gpu.shuffle xor %[[R5]], %[[C16]], %[[C32]] : f32 -// CHECK: %[[R6:.+]] = arith.addf %[[R5]], %[[S4]] : f32 +// CHECK-COUNT-5: gpu.shuffle xor {{.*}} : f32 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<8xf32, #gpu.address_space> -// CHECK: %[[WID:.+]] = arith.divui %{{.*}}, %{{.*}} : index -// CHECK: %[[LANE_ID:.*]] = arith.remui %[[TID]], %[[C32I]] : index -// CHECK: %[[LANE0:.*]] = arith.cmpi eq, %[[LANE_ID]], %[[C0]] : index -// CHECK: scf.if %[[LANE0]] { -// CHECK: memref.store %[[R6]], %[[ALLOC]][%[[WID]]] : memref<8xf32, #gpu.address_space> +// CHECK: scf.if %{{.*}} { +// CHECK: memref.store %{{.*}}, %[[ALLOC]]{{.*}} : memref<8xf32, #gpu.address_space> // CHECK: } // CHECK: gpu.barrier -// CHECK: %[[LANE_ID_IN_BOUNDS:.*]] = arith.minui %[[LANE_ID]] -// CHECK: %[[LOAD_VAL:.+]] = memref.load %[[ALLOC]][%[[LANE_ID_IN_BOUNDS]]] : memref<8xf32, #gpu.address_space> -// CHECK: %[[S5:.+]], %{{.*}} = gpu.shuffle xor %[[LOAD_VAL]], %[[C1]], %[[C32]] : f32 -// CHECK: %[[R7:.+]] = arith.addf %[[LOAD_VAL]], %[[S5]] : f32 -// CHECK: %[[S6:.+]], %{{.*}} = gpu.shuffle xor %[[R7]], %[[C2]], %[[C32]] : f32 -// CHECK: %[[R8:.+]] = arith.addf %[[R7]], %[[S6]] : f32 -// CHECK: %[[S7:.+]], %{{.*}} = gpu.shuffle xor %[[R8]], %[[C4]], %[[C32]] : f32 -// CHECK: %[[R9:.+]] = arith.addf %[[R8]], %[[S7]] : f32 -// CHECK: %[[S9:.+]], %{{.*}} = gpu.shuffle idx %[[R9]], %[[C0I]], %[[C32]] : f32 -// CHECK: %[[R12:.+]] = arith.addf %[[S9]], %[[CF]] : f32 -// CHECK: %[[R13:.+]] = vector.broadcast %[[R12]] : f32 to vector<1xf32> -// CHECK: %[[TID0:.+]] = arith.cmpi eq, %[[TID]], %[[C0]] : index -// CHECK: scf.if %[[TID0]] { -// CHECK: vector.transfer_write %[[R13]], %{{.*}}[%{{.*}}] {in_bounds = [true]} : vector<1xf32>, memref<512xf32, #hal.descriptor_type> +// CHECK: memref.load %[[ALLOC]]{{.*}} : 
memref<8xf32, #gpu.address_space> +// CHECK-COUNT-3: gpu.shuffle xor {{.*}} : f32 +// CHECK: gpu.shuffle idx {{.*}} : f32 +// CHECK: scf.if %{{.*}} { +// CHECK: vector.transfer_write {{.*}} : vector<1xf32>, memref<512xf32, #hal.descriptor_type> // CHECK: } // ----- @@ -149,43 +110,21 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { } } -// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info +// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info // CHECK: func.func @warp_reduction_broadcast_dispatch() // CHECK-SAME: translation_info = #[[TRANSLATION_INFO]] -// CHECK: scf.for {{.*}} -> (vector<1xf32>) { +// CHECK: scf.for {{.*}} -> (vector<4xf32>) { // CHECK: vector.transfer_read {{.*}} : memref<512x10240xf32, #hal.descriptor_type>, vector<4xf32> -// CHECK: vector.reduction , {{.*}} : vector<4xf32> into f32 +// CHECK: arith.addf {{.*}} : vector<4xf32> // CHECK: scf.yield -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: arith.remui +// CHECK-COUNT-5: gpu.shuffle xor // CHECK: scf.if -// CHECK: memref.store {{.*}} : memref<16xf32, #gpu.address_space> +// CHECK: memref.store {{.*}} : memref<8xf32, #gpu.address_space> // CHECK: } -// CHECK: gpu.barrier -// CHECK: arith.minui -// CHECK: memref.load -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: arith.addf -// CHECK: vector.broadcast %{{.*}} : f32 to vector<1xf32> +// CHECK-COUNT-3: gpu.shuffle xor +// CHECK: gpu.shuffle idx +// CHECK: arith.divf {{.*}} : vector<4xf32> // CHECK: scf.for -// CHECK: vector.transfer_read -// CHECK: arith.divf {{.*}} : vector<4x1xf32> // CHECK: vector.transfer_write {{.*}} : vector<4xf32>, memref<512x10240xf32, #hal.descriptor_type> // CHECK: } // CHECK: return @@ -300,3 +239,340 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { // CHECK: vector.transfer_write // CHECK: } // CHECK: return + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @small_reduction { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @small_reduction ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @small_reduction() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 13], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x13xf32> + %3 = tensor.empty() : tensor<1024xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1024xf32>) -> tensor<1024xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<1024x13xf32>) outs(%4 : tensor<1024xf32>) { + ^bb0(%in: f32, 
%out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<1024xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [1024], strides = [1] : tensor<1024xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// Small reduction computes the whole reduction on a single thread. +// CHECK-LABEL: func.func @small_reduction +// CHECK: scf.for %{{.*}} = %c0 to %c13 step %c4 +// CHECK: linalg.generic +// CHECK: arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_reduction { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_reduction ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> + %3 = tensor.empty() : tensor<8xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<8x64xf32>) outs(%4 : tensor<8xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<8xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_reduction +// CHECK: %[[RD:.+]] = vector.transfer_read {{.*}} memref<8x64xf32, #hal.descriptor_type>, vector<2xf32> +// CHECK: %[[ADD:.+]] = arith.addf %[[RD]] +// CHECK: vector.reduction , %[[ADD]] +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf +// CHECK: scf.if +// CHECK: vector.transfer_write {{.*}} memref<8xf32, #hal.descriptor_type> + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_elementwise_reduction_elementwise { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_elementwise_reduction_elementwise ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_elementwise_reduction_elementwise() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> + %3 = tensor.empty() : tensor<8xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> + %5 
= linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<8x64xf32>) outs(%4 : tensor<8xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = arith.addf %in, %in : f32 + %8 = arith.addf %7, %7 : f32 + %9 = arith.addf %8, %out : f32 + linalg.yield %9 : f32 + } -> tensor<8xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<8xf32>) outs(%3 : tensor<8xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.sqrt %in : f32 + linalg.yield %7 : f32 + } -> tensor<8xf32> + flow.dispatch.tensor.store %6, %1, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_elementwise_reduction_elementwise +// CHECK: vector.transfer_read {{.*}} vector<2xf32> +// CHECK: arith.addf{{.*}} : vector<2xf32> +// CHECK: arith.addf{{.*}} : vector<2xf32> +// CHECK: arith.addf{{.*}} : vector<2xf32> +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf +// CHECK: %[[SQRT_VEC:.+]] = math.sqrt +// CHECK: scf.if +// CHECK: vector.transfer_write %[[SQRT_VEC]], {{.*}} : vector<1xf32>, memref<8xf32, #hal.descriptor_type> + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_reduction_larger { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_reduction_larger ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction_larger() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<33x1024xf32> + %3 = tensor.empty() : tensor<33xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<33xf32>) -> tensor<33xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<33x1024xf32>) outs(%4 : tensor<33xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<33xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [33], strides = [1] : tensor<33xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_reduction_larger +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf +// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8xf32, #gpu.address_space> +// CHECK: scf.if +// CHECK: memref.store %{{.*}}, %[[ALLOC]][%{{.*}}] : memref<8xf32, #gpu.address_space> +// CHECK: } +// CHECK: arith.minui +// CHECK: memref.load +// CHECK-COUNT-3: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf +// CHECK: %[[RES:.*]], %{{.*}} = gpu.shuffle idx +// CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<1xf32> +// CHECK: scf.if +// CHECK: vector.transfer_write %[[RES_VEC]] + +// ----- + +#pipeline_layout = 
#hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_reduction_1d { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction_1d() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> + %3 = tensor.empty() : tensor + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor + flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_reduction_1d +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_elementwise_reduction_elementwise_4d { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_elementwise_reduction_elementwise_4d ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_elementwise_reduction_elementwise_4d() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 8, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x4x8x64xf32> + %3 = tensor.empty() : tensor<2x4x8xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x4x8xf32>) -> tensor<2x4x8xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], + iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2 : tensor<2x4x8x64xf32>) outs(%4 : tensor<2x4x8xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = arith.addf %in, %in : f32 + %8 = arith.addf %7, %7 : f32 + %9 = arith.addf %8, %out : f32 + linalg.yield %9 : f32 + } -> tensor<2x4x8xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], + iterator_types = ["parallel", "parallel", "parallel"]} ins(%5 : tensor<2x4x8xf32>) outs(%3 : tensor<2x4x8xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.sqrt %in : f32 + linalg.yield %7 : f32 + 
} -> tensor<2x4x8xf32> + flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0], sizes = [2, 4, 8], strides = [1, 1, 1] : tensor<2x4x8xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_elementwise_reduction_elementwise_4d +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +hal.executable private @i4_dequant_matvec { + hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @i4_dequant_matvec() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %9 = tensor.empty() : tensor<4096xf16> + %10 = tensor.empty() : tensor<4096x32x128xf16> + %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> + %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { + ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f16 + %16 = arith.subf %15, %in_1 : f16 + %17 = arith.mulf %16, %in_0 : f16 + linalg.yield %17 : f16 + } -> tensor<4096x32x128xf16> + %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %14 = arith.mulf %in, %in_0 : f16 + %15 = arith.addf %14, %out : f16 + linalg.yield %15 : f16 + } -> tensor<4096xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0], 
sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func.func @i4_dequant_matvec() +// CHECK: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x8xf16> +// CHECK: %[[FOR:.+]] = scf.for %{{.+}} = %c0 to %c32 step %c4 iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<1x8xf16>) +// CHECK: %[[READ0:.+]] = vector.transfer_read {{.+}} : memref<4096x32x128xi4, #hal.descriptor_type>, vector<1x8xi4> +// CHECK: %[[READ1:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> +// CHECK: %[[READ2:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> +// CHECK: %[[READ3:.+]] = vector.transfer_read {{.+}} : memref<32x128xf16, #hal.descriptor_type>, vector<1x8xf16> +// CHECK: %[[EXTEND:.+]] = arith.extui %[[READ0]] : vector<1x8xi4> to vector<1x8xi32> +// CHECK: %[[CVT:.+]] = arith.uitofp %[[EXTEND]] : vector<1x8xi32> to vector<1x8xf16> +// CHECK: %[[SUB:.+]] = arith.subf %[[CVT]], %[[READ1]] : vector<1x8xf16> +// CHECK: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[READ2]] : vector<1x8xf16> +// CHECK: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<1x8xf16> +// CHECK: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[ARG]] : vector<1x8xf16> + +// CHECK: %[[SCAST:.+]] = vector.shape_cast %[[FOR]] : vector<1x8xf16> to vector<8xf16> +// CHECK: vector.reduction , %[[SCAST]] : vector<8xf16> into f16 +// CHECK-COUNT-6: gpu.shuffle xor +// CHECK: scf.if +// CHECK: vector.transfer_write diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir index c46f738d3fa9..fea7846af70b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir @@ -1,84 +1,335 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s --check-prefix=CDNA3 +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 \ +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" \ +// RUN: %s | FileCheck %s +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 \ +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" \ +// RUN: %s | FileCheck %s --check-prefix=CDNA3 #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding ]> -func.func @softmax() { - %c0 = arith.constant 0 : index - %cst = arith.constant -3.40282347E+38 : f32 - %cst_0 = arith.constant 0.000000e+00 : f32 - %cst_1 = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = 
flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> - %3 = tensor.empty() : tensor<12x128x40960xf32> - %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor> - return +hal.executable @group_reduction_1d { +hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction_1d() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> + %3 = tensor.empty() : tensor + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor + flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> + return + } + } +} } -// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CHECK-LABEL: func.func @softmax -// CHECK-SAME: translation_info = #[[$TRANSLATION]] -// CHECK-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} +// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CDNA3: func.func @group_reduction_1d() +// CDNA3-SAME: translation_info = #[[$TRANSLATION]] +// CDNA3-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_reduction_1d { +hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction_1d() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> + %3 = tensor.empty() : tensor + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : 
tensor<64xf32>) outs(%4 : tensor) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor + flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> + return + } + } +} +} + +// On CDNA, we prefer wave64 with subgroup size of 64. + +// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK: func.func @group_reduction_1d +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +hal.executable private @i4_dequant_matvec { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @i4_dequant_matvec() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %9 = tensor.empty() : tensor<4096xf16> + %10 = tensor.empty() : tensor<4096x32x128xf16> + %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> + %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { + ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f16 + %16 = arith.subf %15, %in_1 : f16 + %17 = arith.mulf %16, %in_0 : f16 + linalg.yield %17 : f16 + } -> tensor<4096x32x128xf16> + %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: 
f16): + %14 = arith.mulf %in, %in_0 : f16 + %15 = arith.addf %14, %out : f16 + linalg.yield %15 : f16 + } -> tensor<4096xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CDNA3: func.func @i4_dequant_matvec() +// CDNA3-SAME: translation_info = #[[$TRANSLATION]] +// CDNA3: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x8xf16> +// CDNA3: %[[FOR:.+]] = scf.for %{{.+}} = %c0 to %c32 step %c4 iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<1x8xf16>) +// CDNA3: %[[READ0:.+]] = vector.transfer_read {{.+}} : memref<4096x32x128xi4, #hal.descriptor_type>, vector<1x8xi4> +// CDNA3: %[[READ1:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> +// CDNA3: %[[READ2:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> +// CDNA3: %[[READ3:.+]] = vector.transfer_read {{.+}} : memref<32x128xf16, #hal.descriptor_type>, vector<1x8xf16> +// CDNA3: %[[EXTEND:.+]] = arith.extui %[[READ0]] : vector<1x8xi4> to vector<1x8xi32> +// CDNA3: %[[CVT:.+]] = arith.uitofp %[[EXTEND]] : vector<1x8xi32> to vector<1x8xf16> +// CDNA3: %[[SUB:.+]] = arith.subf %[[CVT]], %[[READ1]] : vector<1x8xf16> +// CDNA3: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[READ2]] : vector<1x8xf16> +// CDNA3: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<1x8xf16> +// CDNA3: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[ARG]] : vector<1x8xf16> + +// CDNA3: %[[SCAST:.+]] = vector.shape_cast %[[FOR]] : vector<1x8xf16> to vector<8xf16> +// CDNA3: vector.reduction , %[[SCAST]] : vector<8xf16> into f16 +// CDNA3-COUNT-6: gpu.shuffle xor +// CDNA3: scf.if +// CDNA3: vector.transfer_write + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +hal.executable private @i4_dequant_matvec { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @i4_dequant_matvec() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> 
-> tensor<4096x32xf16> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %9 = tensor.empty() : tensor<4096xf16> + %10 = tensor.empty() : tensor<4096x32x128xf16> + %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> + %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { + ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f16 + %16 = arith.subf %15, %in_1 : f16 + %17 = arith.mulf %16, %in_0 : f16 + linalg.yield %17 : f16 + } -> tensor<4096x32x128xf16> + %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %14 = arith.mulf %in, %in_0 : f16 + %15 = arith.addf %14, %out : f16 + linalg.yield %15 : f16 + } -> tensor<4096xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK: func.func @i4_dequant_matvec() +// CHECK-SAME: translation_info = #[[$TRANSLATION]] // ----- #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding ]> -func.func @softmax() { - %c0 = arith.constant 0 : index - %cst = arith.constant -3.40282347E+38 : f32 - %cst_0 = arith.constant 0.000000e+00 : f32 - %cst_1 = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> - %3 = tensor.empty() : tensor<12x128x40960xf32> - %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor> - return +hal.executable private @matvec_fp16 { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matvec_fp16() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = 
hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> + %5 = tensor.empty() : tensor<1x32000xf16> + %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> + %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.mulf %in, %in_0 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield %9 : f16 + } -> tensor<1x32000xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor> + return + } + } + } } -// On CDNA, we prefer wave64 with subgroup size 64. +// This matvec is expected to be reduced multiple rows at a time by a single workgroup. +// Check that we distribute it across subgroup threads properly. Thread 0 is expected to +// write 8 results at the end. +// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed. -// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CDNA3: func.func @softmax -// CDNA3-SAME: translation_info = #[[$TRANSLATION]] -// CDNA3-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} +// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK: func.func @matvec_fp16() +// CHECK-SAME: translation_info = #[[$TRANSLATION]] +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index +// CHECK-DAG: %[[C4096:.+]] = arith.constant 4096 : index +// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x8xf16> +// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C4096]] step %[[C512]] iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<8x8xf16>) +// CHECK-DAG: %[[MAT:.+]] = vector.transfer_read {{.+}} : memref<32000x4096xf16, #hal.descriptor_type>, vector<8x8xf16> +// CHECK-DAG: %[[VEC:.+]] = vector.transfer_read {{.+}} : memref<1x4096xf16, #hal.descriptor_type>, vector<8x8xf16> +// CHECK: %[[MUL:.+]] = arith.mulf %[[VEC]], %[[MAT]] : vector<8x8xf16> +// CHECK: %[[ADD:.+]] = arith.addf %[[ARG]], %[[MUL]] : vector<8x8xf16> + +// CHECK: vector.reduction , %{{.+}} : vector<8xf16> into f16 +// CHECK-COUNT-24: gpu.shuffle xor +// CHECK: scf.if +// CHECK: vector.transfer_write {{.+}} : vector<8xf16>, memref<1x32000xf16, #hal.descriptor_type> // ----- -#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding ]> -func.func @dynamic_softmax() { - %c32_i64 = arith.constant 32 : i64 - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 - %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 - %2 = arith.extui %0 : i32 to i64 - %3 = arith.extui %1 : i32 to i64 - %4 = arith.shli %3, %c32_i64 : i64 - %5 = arith.ori %2, %4 : i64 - %6 = arith.index_castui %5 : i64 to index - %7 = flow.dispatch.workload.ordinal %6, 0 : index - %8 = 
hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%7} - %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%7} - %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : !flow.dispatch.tensor>{%7} -> tensor<32x?xf16> - %11 = tensor.empty(%7) : tensor<32x?xf16> - %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16> - flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor>{%7} - return +hal.executable private @matvec_fp16 { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matvec_fp16() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> + %5 = tensor.empty() : tensor<1x32000xf16> + %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> + %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.mulf %in, %in_0 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield %9 : f16 + } -> tensor<1x32000xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor> + return + } + } + } } +// Multi-row matvec with wave32. +// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed. 
+ +// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CDNA3: func.func @matvec_fp16() +// CDNA3-SAME: translation_info = #[[$TRANSLATION]] +// CDNA3-DAG: %[[C0:.+]] = arith.constant 0 : index +// CDNA3-DAG: %[[C512:.+]] = arith.constant 512 : index +// CDNA3-DAG: %[[C4096:.+]] = arith.constant 4096 : index +// CDNA3-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x8xf16> +// CDNA3: scf.for %{{.+}} = %[[C0]] to %[[C4096]] step %[[C512]] iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<8x8xf16>) +// CDNA3-DAG: %[[MAT:.+]] = vector.transfer_read {{.+}} : memref<32000x4096xf16, #hal.descriptor_type>, vector<8x8xf16> +// CDNA3-DAG: %[[VEC:.+]] = vector.transfer_read {{.+}} : memref<1x4096xf16, #hal.descriptor_type>, vector<8x8xf16> +// CDNA3: %[[MUL:.+]] = arith.mulf %[[VEC]], %[[MAT]] : vector<8x8xf16> +// CDNA3: %[[ADD:.+]] = arith.addf %[[ARG]], %[[MUL]] : vector<8x8xf16> -// Finer details of this lowering are captured by the spirv pipeline test. Just -// verify that warp reduction triggers. -// CHECK-LABEL: func.func @dynamic_softmax -// CHECK-COUNT-10: gpu.shuffle xor {{.*}} : i32 +// CDNA3: vector.reduction , %{{.+}} : vector<8xf16> into f16 +// CDNA3-COUNT-24: gpu.shuffle xor +// CDNA3: scf.if +// CDNA3: vector.transfer_write {{.+}} : vector<8xf16>, memref<1x32000xf16, #hal.descriptor_type> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir new file mode 100644 index 000000000000..c46f738d3fa9 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir @@ -0,0 +1,84 @@ +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s --check-prefix=CDNA3 + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @softmax() { + %c0 = arith.constant 0 : index + %cst = arith.constant -3.40282347E+38 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> + %3 = tensor.empty() : tensor<12x128x40960xf32> + %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor> + return +} + +// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK-LABEL: func.func @softmax +// CHECK-SAME: translation_info = #[[$TRANSLATION]] +// CHECK-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> 
+func.func @softmax() { + %c0 = arith.constant 0 : index + %cst = arith.constant -3.40282347E+38 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> + %3 = tensor.empty() : tensor<12x128x40960xf32> + %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor> + return +} + +// On CDNA, we prefer wave64 with subgroup size 64. + +// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CDNA3: func.func @softmax +// CDNA3-SAME: translation_info = #[[$TRANSLATION]] +// CDNA3-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @dynamic_softmax() { + %c32_i64 = arith.constant 32 : i64 + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.extui %0 : i32 to i64 + %3 = arith.extui %1 : i32 to i64 + %4 = arith.shli %3, %c32_i64 : i64 + %5 = arith.ori %2, %4 : i64 + %6 = arith.index_castui %5 : i64 to index + %7 = flow.dispatch.workload.ordinal %6, 0 : index + %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%7} + %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%7} + %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : !flow.dispatch.tensor>{%7} -> tensor<32x?xf16> + %11 = tensor.empty(%7) : tensor<32x?xf16> + %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16> + flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor>{%7} + return +} + + +// Finer details of this lowering are captured by the spirv pipeline test. Just +// verify that warp reduction triggers. 
+// CHECK-LABEL: func.func @dynamic_softmax +// CHECK-COUNT-10: gpu.shuffle xor {{.*}} : i32 diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir deleted file mode 100644 index e3b16eb22cbc..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir +++ /dev/null @@ -1,565 +0,0 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, iree-codegen-lower-executable-using-transform-dialect, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @small_reduction { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @small_reduction ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @small_reduction() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 13], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x13xf32> - %3 = tensor.empty() : tensor<1024xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1024xf32>) -> tensor<1024xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<1024x13xf32>) outs(%4 : tensor<1024xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<1024xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [1024], strides = [1] : tensor<1024xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// Small reduction computes the whole reduction on a single thread. 
-// CHECK-LABEL: func.func @small_reduction -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index -// CHECK-NOT: memref.alloc() -// CHECK: gpu.thread_id x -// CHECK: %[[v:.*]] = scf.for %{{.*}} = %[[C0]] to %[[C12]] step %[[C4]] {{.*}} -> (vector<1xf32>) { -// CHECK: vector.transfer_read {{.*}}: memref<1024x13xf32, #hal.descriptor_type>, vector<1x4xf32> -// CHECK: vector.multi_reduction , %{{.*}} : vector<1x4xf32> to vector<1xf32> -// CHECK: } -// CHECK-NOT: gpu.barrier -// CHECK: %[[r:.*]] = vector.transfer_read {{.*}}: memref<1024x13xf32, #hal.descriptor_type>, vector<1x1xf32> -// CHECK: %[[r1:.*]] = vector.shape_cast %[[r:.*]] : vector<1x1xf32> to vector<1xf32> -// CHECK: arith.addf %[[v]], %[[r1]] : vector<1xf32> - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> - %3 = tensor.empty() : tensor<8xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<8x64xf32>) outs(%4 : tensor<8xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<8xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_reduction -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK-DAG: %[[TIDX:.+]] = gpu.thread_id x - -// No allocation created for the per thread data. -// CHECK-NOT: memref.alloc() - -// Fusion occurred, no barrier before the loop -// CHECK-NOT: gpu.barrier -// Local per-thread scf.for-based reduction. -// CHECK: %[[v:.*]] = scf.for {{.*}} -> (vector<1xf32>) { -// CHECK: vector.transfer_read {{.*}} memref<8x64xf32, #hal.descriptor_type>, vector<1xf32> -// CHECK: arith.addf {{.*}} : vector<1xf32> -// No barrier within the loop. -// CHECK-NOT: gpu.barrier -// CHECK: } -// No store after the loop, the data are kept in register. -// CHECK-NOT: vector.transfer_write -// Barrier after the loop. 
-// CHECK: gpu.barrier - -// CHECK-NOT: vector.transfer_read -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// CHECK: %[[RES:.*]] = arith.addf %{{.*}} : f32 -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %{{.*}} : f32 to vector<1xf32> -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[RES_VEC]] -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_elementwise_reduction_elementwise { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_elementwise_reduction_elementwise ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_elementwise_reduction_elementwise() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> - %3 = tensor.empty() : tensor<8xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<8x64xf32>) outs(%4 : tensor<8xf32>) { - ^bb0(%in: f32, %out: f32): - %7 = arith.addf %in, %in : f32 - %8 = arith.addf %7, %7 : f32 - %9 = arith.addf %8, %out : f32 - linalg.yield %9 : f32 - } -> tensor<8xf32> - %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<8xf32>) outs(%3 : tensor<8xf32>) { - ^bb0(%in: f32, %out: f32): - %7 = math.sqrt %in : f32 - linalg.yield %7 : f32 - } -> tensor<8xf32> - flow.dispatch.tensor.store %6, %1, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_elementwise_reduction_elementwise -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK-NOT: memref.alloc() - -// Fusion occurred, no barrier before the loop -// CHECK-NOT: gpu.barrier -// Local per-thread scf.for-based reduction. 
-// CHECK: %[[TIDX:.+]] = gpu.thread_id x -// CHECK: %[[v:.*]] = scf.for {{.*}} -> (vector<1xf32>) -// CHECK: vector.transfer_read {{.*}} vector<1xf32> -// CHECK: arith.addf{{.*}} : vector<1xf32> -// CHECK: arith.addf{{.*}} : vector<1xf32> -// CHECK: arith.addf{{.*}} : vector<1xf32> -// No barrier within the loop -// CHECK-NOT: gpu.barrier -// CHECK: } -// CHECK-NOT: vector.transfer_write -// Barrier after the loop -// CHECK: gpu.barrier - -// CHECK-NOT: vector.transfer_read -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// CHECK: %[[PARTIAL:.*]] = arith.addf %{{.*}} -// CHECK: %[[BROADCAST:.*]], %{{.*}} = gpu.shuffle idx %[[PARTIAL]] -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[BROADCAST]] : f32 to vector<1xf32> -// CHECK: %[[SQRT_VEC:.*]] = math.sqrt %[[RES_VEC]] : vector<1xf32> -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[SQRT_VEC]], {{.*}} : vector<1xf32>, memref<8xf32, #hal.descriptor_type> -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_larger { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction_larger ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_larger() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<33x1024xf32> - %3 = tensor.empty() : tensor<33xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<33xf32>) -> tensor<33xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<33x1024xf32>) outs(%4 : tensor<33xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<33xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [33], strides = [1] : tensor<33xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_reduction_larger -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK-NOT: memref.alloc() - -// Fusion occurred, no barrier before the loop -// CHECK-NOT: gpu.barrier -// Local per-thread scf.for-based reduction. 
-// CHECK: %[[TIDX:.+]] = gpu.thread_id x -// CHECK: %[[TIDX_TIMES_4:.]] = affine.apply{{.*}}[%[[TIDX]]] -// CHECK: scf.for {{.*}} -> (vector<1xf32>) { -// CHECK: vector.transfer_read {{.*}} vector<4xf32> -// CHECK: vector.reduction {{.*}} : vector<4xf32> into f32 -// CHECK: vector.broadcast {{.*}} : f32 to vector<1xf32> -// No barrier within the loop -// CHECK-NOT: gpu.barrier -// CHECK: } -// CHECK-NOT: vector.transfer_write -// CHECK-NOT: vector.transfer_read -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf -// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8xf32, #gpu.address_space> -// CHECK: scf.if -// CHECK: memref.store %{{.*}}, %[[ALLOC]][%{{.*}}] : memref<8xf32, #gpu.address_space> -// CHECK: } -// CHECK: arith.minui -// CHECK: memref.load -// CHECK-COUNT-3: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf -// CHECK: %[[RES:.*]], %{{.*}} = gpu.shuffle idx -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<1xf32> -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[RES_VEC]] -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_1d { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_1d() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> - %3 = tensor.empty() : tensor - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor - %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor - flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_reduction_1d -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_elementwise_reduction_elementwise_4d { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_elementwise_reduction_elementwise_4d ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_elementwise_reduction_elementwise_4d() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 8, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x4x8x64xf32> - %3 = tensor.empty() : tensor<2x4x8xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x4x8xf32>) -> tensor<2x4x8xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2 : tensor<2x4x8x64xf32>) outs(%4 : tensor<2x4x8xf32>) { - ^bb0(%in: f32, %out: f32): - %7 = arith.addf %in, %in : f32 - %8 = arith.addf %7, %7 : f32 - %9 = arith.addf %8, %out : f32 - linalg.yield %9 : f32 - } -> tensor<2x4x8xf32> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} ins(%5 : tensor<2x4x8xf32>) outs(%3 : tensor<2x4x8xf32>) { - ^bb0(%in: f32, %out: f32): - %7 = math.sqrt %in : f32 - linalg.yield %7 : f32 - } -> tensor<2x4x8xf32> - flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0], sizes = [2, 4, 8], strides = [1, 1, 1] : tensor<2x4x8xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_elementwise_reduction_elementwise_4d -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_i8_12345 { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction_i8_12345 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_i8_12345() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0 : i8 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 12345], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x12345xi8> - %3 = tensor.empty() : tensor<8x12345xi8> - %4 = tensor.empty() : tensor<8xi8> - %5 = linalg.fill ins(%cst : i8) outs(%4 : tensor<8xi8>) -> tensor<8xi8> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%2 : tensor<8x12345xi8>) - outs(%5 : tensor<8xi8>) { - ^bb0(%in: i8, %out: i8): - %6 = arith.addi %in, %out : i8 - linalg.yield %6 : i8 - } -> tensor<8xi8> - %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%2, %6 : tensor<8x12345xi8>, tensor<8xi8>) - outs(%3 : tensor<8x12345xi8>) { - ^bb0(%in: i8, %in_0: i8, %out: i8): - %8 = arith.divui %in, %in_0 : i8 - linalg.yield %8 : i8 - } -> tensor<8x12345xi8> - flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [8, 12345], strides = [1, 1] : tensor<8x12345xi8> -> !flow.dispatch.tensor> - return - } - } -} -} - - -// 
CHECK-LABEL: func.func @group_reduction_i8_12345 -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index - -// CHECK-DAG: %[[ALLOC0:.+]] = memref.alloc() {alignment = 64 : i64} : memref<1xi8, #gpu.address_space> -// Local per-thread scf.for-based reduction. -// CHECK: %[[TIDX:.+]] = gpu.thread_id x -// CHECK: scf.for {{.*}} -> (vector<1xi8>) -// CHECK: vector.transfer_read {{.*}} vector<1xi8> -// CHECK: arith.addi{{.*}} : vector<1xi8> -// CHECK-NOT: vector.transfer_write -// No barrier within the loop -// CHECK-NOT: gpu.barrier -// CHECK: } -// CHECK-NOT: vector.transfer_write -// Barrier after the loop -// CHECK: gpu.barrier - -// CHECK-NOT: vector.transfer_read -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}arith.trunci{{.*}}{{[[:space:]].*}}arith.addi{{.*}}i8 -// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32xi8, #gpu.address_space> -// CHECK: scf.if -// CHECK: memref.store %{{.*}}, %[[ALLOC]][%{{.*}}] : memref<32xi8, #gpu.address_space> -// CHECK: } -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}arith.trunci{{.*}}{{[[:space:]].*}}arith.addi{{.*}}i8 - -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %{{.+}} : i8 to vector<1xi8> -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[RES_VEC]], %[[ALLOC0]][%[[C0]]] {in_bounds = [true]} : vector<1xi8>, memref<1xi8, #gpu.address_space> - -// CHECK: gpu.barrier -// CHECK: arith.divui {{.*}} vector<8xi8> -// CHECK: arith.divui {{.*}} i8 -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d0)> -hal.executable @reduction_2d_trailing_elementwise_static_dispatch_0 { - hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { - hal.executable.export public @reduction_2d_trailing_elementwise_static_dispatch_0_generic_128x10_f32 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @reduction_2d_trailing_elementwise_static_dispatch_0_generic_128x10_f32() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 10], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x10xf32> - %3 = tensor.empty() : tensor<128x10xf32> - %4 = tensor.empty() : tensor<128xf32> - %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<128xf32>) -> tensor<128xf32> - %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<128x10xf32>) outs(%5 : tensor<128xf32>) { - ^bb0(%in: f32, %out: f32): - %8 = arith.addf %in, %out : f32 - linalg.yield %8 : f32 - } -> tensor<128xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<128x10xf32>, tensor<128xf32>) outs(%3 : tensor<128x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.divf 
%in, %in_0 : f32 - linalg.yield %8 : f32 - } -> tensor<128x10xf32> - flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [128, 10], strides = [1, 1] : tensor<128x10xf32> -> !flow.dispatch.tensor> - return - } - } - } -} - -// CHECK-LABEL: func.func @reduction_2d_trailing_elementwise_static_dispatch_0 -// CHECK-NOT: gpu.shuffle -// -// Loop vector<4> + tail vector<2> reduction part run sequentially. -// CHECK: scf.for {{.*}} -> (vector<1xf32>) { -// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<128x10xf32, #hal.descriptor_type>, vector<1x4xf32> -// CHECK: vector.multi_reduction , {{.*}} [1] : vector<1x4xf32> to vector<1xf32> -// CHECK: scf.yield %{{.*}} : vector<1xf32> -// CHECK: } -// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<128x10xf32, #hal.descriptor_type>, vector<1x2xf32> -// CHECK: vector.multi_reduction , {{.*}} [1] : vector<1x2xf32> to vector<1xf32> -// CHECK: vector.broadcast {{.*}} : vector<1xf32> to vector<1x4xf32> -// -// Loop vector<4> + tail vector<2> writeback part run sequentially. -// CHECK: scf.for {{.*}} { -// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<128x10xf32, #hal.descriptor_type>, vector<1x4xf32> -// CHECK: arith.divf {{.*}} : vector<1x4xf32> -// CHECK: vector.transfer_write {{.*}} {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x8xf32, strided<[10, 1], offset: ?>, #hal.descriptor_type> -// CHECK: } -// CHECK: vector.broadcast {{.*}} : vector<1xf32> to vector<1x2xf32> -// CHECK: arith.divf {{.*}} : vector<1x2xf32> -// CHECK: vector.transfer_write {{.*}} {in_bounds = [true, true]} : vector<1x2xf32>, memref<1x10xf32, strided<[10, 1], offset: ?>, #hal.descriptor_type> -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @i4_dequant_matvec { - hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @i4_dequant_matvec() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes 
= [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %9 = tensor.empty() : tensor<4096xf16> - %10 = tensor.empty() : tensor<4096x32x128xf16> - %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> - %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { - ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f16 - %16 = arith.subf %15, %in_1 : f16 - %17 = arith.mulf %16, %in_0 : f16 - linalg.yield %17 : f16 - } -> tensor<4096x32x128xf16> - %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %14 = arith.mulf %in, %in_0 : f16 - %15 = arith.addf %14, %out : f16 - linalg.yield %15 : f16 - } -> tensor<4096xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> - return - } - } - } -} - -// CHECK-LABEL: func.func @i4_dequant_matvec() -// CHECK: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x8xf16> -// CHECK: %[[FOR:.+]] = scf.for %{{.+}} = %c0 to %c32 step %c4 iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<1x8xf16>) -// CHECK: %[[READ0:.+]] = vector.transfer_read {{.+}} : memref<4096x32x128xi4, #hal.descriptor_type>, vector<1x8xi4> -// CHECK: %[[READ1:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> -// CHECK: %[[READ2:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> -// CHECK: %[[READ3:.+]] = vector.transfer_read {{.+}} : memref<32x128xf16, #hal.descriptor_type>, vector<1x8xf16> -// CHECK: %[[EXTEND:.+]] = arith.extui %[[READ0]] : vector<1x8xi4> to vector<1x8xi32> -// CHECK: %[[CVT:.+]] = arith.uitofp %[[EXTEND]] : vector<1x8xi32> to vector<1x8xf16> -// CHECK: %[[SUB:.+]] = arith.subf %[[CVT]], %[[READ1]] : vector<1x8xf16> -// CHECK: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[READ2]] : vector<1x8xf16> -// CHECK: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<1x8xf16> -// CHECK: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[ARG]] : vector<1x8xf16> - -// CHECK: %[[SCAST:.+]] = vector.shape_cast %[[FOR]] : vector<1x8xf16> to vector<8xf16> -// CHECK: vector.reduction , %[[SCAST]] : vector<8xf16> into f16 -// CHECK-COUNT-6: gpu.shuffle xor -// CHECK: scf.if -// CHECK: vector.transfer_write diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir deleted file mode 100644 index fea7846af70b..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir +++ /dev/null @@ -1,335 +0,0 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 \ -// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, 
func.func(iree-llvmgpu-lower-executable-target)))))" \ -// RUN: %s | FileCheck %s -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 \ -// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" \ -// RUN: %s | FileCheck %s --check-prefix=CDNA3 - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_1d { -hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_1d() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> - %3 = tensor.empty() : tensor - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor - %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor - flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> - return - } - } -} -} - -// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CDNA3: func.func @group_reduction_1d() -// CDNA3-SAME: translation_info = #[[$TRANSLATION]] -// CDNA3-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_1d { -hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_1d() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> - %3 = tensor.empty() : tensor - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor - %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor - flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> - return - } - } -} 
-} - -// On CDNA, we prefer wave64 with subgroup size of 64. - -// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CHECK: func.func @group_reduction_1d -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @i4_dequant_matvec { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @i4_dequant_matvec() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %9 = tensor.empty() : tensor<4096xf16> - %10 = tensor.empty() : tensor<4096x32x128xf16> - %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> - %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { - ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f16 - %16 = arith.subf %15, %in_1 : f16 - %17 = arith.mulf %16, %in_0 : f16 - linalg.yield %17 : f16 - } -> tensor<4096x32x128xf16> - %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %14 = arith.mulf %in, %in_0 : f16 - %15 = arith.addf %14, %out : f16 - linalg.yield %15 : f16 - } -> tensor<4096xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> - return - } - } 
- } -} - -// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CDNA3: func.func @i4_dequant_matvec() -// CDNA3-SAME: translation_info = #[[$TRANSLATION]] -// CDNA3: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x8xf16> -// CDNA3: %[[FOR:.+]] = scf.for %{{.+}} = %c0 to %c32 step %c4 iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<1x8xf16>) -// CDNA3: %[[READ0:.+]] = vector.transfer_read {{.+}} : memref<4096x32x128xi4, #hal.descriptor_type>, vector<1x8xi4> -// CDNA3: %[[READ1:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> -// CDNA3: %[[READ2:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> -// CDNA3: %[[READ3:.+]] = vector.transfer_read {{.+}} : memref<32x128xf16, #hal.descriptor_type>, vector<1x8xf16> -// CDNA3: %[[EXTEND:.+]] = arith.extui %[[READ0]] : vector<1x8xi4> to vector<1x8xi32> -// CDNA3: %[[CVT:.+]] = arith.uitofp %[[EXTEND]] : vector<1x8xi32> to vector<1x8xf16> -// CDNA3: %[[SUB:.+]] = arith.subf %[[CVT]], %[[READ1]] : vector<1x8xf16> -// CDNA3: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[READ2]] : vector<1x8xf16> -// CDNA3: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<1x8xf16> -// CDNA3: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[ARG]] : vector<1x8xf16> - -// CDNA3: %[[SCAST:.+]] = vector.shape_cast %[[FOR]] : vector<1x8xf16> to vector<8xf16> -// CDNA3: vector.reduction , %[[SCAST]] : vector<8xf16> into f16 -// CDNA3-COUNT-6: gpu.shuffle xor -// CDNA3: scf.if -// CDNA3: vector.transfer_write - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @i4_dequant_matvec { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @i4_dequant_matvec() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %9 = tensor.empty() : tensor<4096xf16> - %10 = tensor.empty() : tensor<4096x32x128xf16> - %11 = linalg.fill 
ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> - %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { - ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f16 - %16 = arith.subf %15, %in_1 : f16 - %17 = arith.mulf %16, %in_0 : f16 - linalg.yield %17 : f16 - } -> tensor<4096x32x128xf16> - %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %14 = arith.mulf %in, %in_0 : f16 - %15 = arith.addf %14, %out : f16 - linalg.yield %15 : f16 - } -> tensor<4096xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> - return - } - } - } -} - -// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CHECK: func.func @i4_dequant_matvec() -// CHECK-SAME: translation_info = #[[$TRANSLATION]] - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @matvec_fp16 { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @matvec_fp16() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> - %5 = tensor.empty() : tensor<1x32000xf16> - %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> - %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.mulf %in, %in_0 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } -> tensor<1x32000xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> 
!flow.dispatch.tensor> - return - } - } - } -} - -// This matvec is expected to be reduced multiple rows at a time by a single workgroup. -// Check that we distribute it across subgroup threads properly. Thread 0 is expected to -// write 8 results at the end. -// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed. - -// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CHECK: func.func @matvec_fp16() -// CHECK-SAME: translation_info = #[[$TRANSLATION]] -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index -// CHECK-DAG: %[[C4096:.+]] = arith.constant 4096 : index -// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x8xf16> -// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C4096]] step %[[C512]] iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<8x8xf16>) -// CHECK-DAG: %[[MAT:.+]] = vector.transfer_read {{.+}} : memref<32000x4096xf16, #hal.descriptor_type>, vector<8x8xf16> -// CHECK-DAG: %[[VEC:.+]] = vector.transfer_read {{.+}} : memref<1x4096xf16, #hal.descriptor_type>, vector<8x8xf16> -// CHECK: %[[MUL:.+]] = arith.mulf %[[VEC]], %[[MAT]] : vector<8x8xf16> -// CHECK: %[[ADD:.+]] = arith.addf %[[ARG]], %[[MUL]] : vector<8x8xf16> - -// CHECK: vector.reduction , %{{.+}} : vector<8xf16> into f16 -// CHECK-COUNT-24: gpu.shuffle xor -// CHECK: scf.if -// CHECK: vector.transfer_write {{.+}} : vector<8xf16>, memref<1x32000xf16, #hal.descriptor_type> - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @matvec_fp16 { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @matvec_fp16() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> - %5 = tensor.empty() : tensor<1x32000xf16> - %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> - %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.mulf %in, %in_0 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } -> tensor<1x32000xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> 
!flow.dispatch.tensor> - return - } - } - } -} - -// Multi-row matvec with wave32. -// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed. - -// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CDNA3: func.func @matvec_fp16() -// CDNA3-SAME: translation_info = #[[$TRANSLATION]] -// CDNA3-DAG: %[[C0:.+]] = arith.constant 0 : index -// CDNA3-DAG: %[[C512:.+]] = arith.constant 512 : index -// CDNA3-DAG: %[[C4096:.+]] = arith.constant 4096 : index -// CDNA3-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x8xf16> -// CDNA3: scf.for %{{.+}} = %[[C0]] to %[[C4096]] step %[[C512]] iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<8x8xf16>) -// CDNA3-DAG: %[[MAT:.+]] = vector.transfer_read {{.+}} : memref<32000x4096xf16, #hal.descriptor_type>, vector<8x8xf16> -// CDNA3-DAG: %[[VEC:.+]] = vector.transfer_read {{.+}} : memref<1x4096xf16, #hal.descriptor_type>, vector<8x8xf16> -// CDNA3: %[[MUL:.+]] = arith.mulf %[[VEC]], %[[MAT]] : vector<8x8xf16> -// CDNA3: %[[ADD:.+]] = arith.addf %[[ARG]], %[[MUL]] : vector<8x8xf16> - -// CDNA3: vector.reduction , %{{.+}} : vector<8xf16> into f16 -// CDNA3-COUNT-24: gpu.shuffle xor -// CDNA3: scf.if -// CDNA3: vector.transfer_write {{.+}} : vector<8xf16>, memref<1x32000xf16, #hal.descriptor_type> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir deleted file mode 100644 index 33d5f29e93be..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir +++ /dev/null @@ -1,191 +0,0 @@ -// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-jit=1 --iree-codegen-llvmgpu-enable-transform-dialect-batch-matmul-strategy |\ -// RUN: FileCheck %s --check-prefixes=CHECK,DEFAULT - -// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-jit=1 --iree-codegen-llvmgpu-enable-transform-dialect-batch-matmul-strategy \ -// RUN: -td-matmul-strategy-blk-sizes=128,64,32,2 \ -// RUN: -td-matmul-strategy-reduc-size=8 \ -// RUN: -td-matmul-strategy-num-threads=32,4,1 \ -// RUN: -td-matmul-strategy-num-warps=1,4,1 \ -// RUN: -td-matmul-strategy-use-async-copies=true \ -// RUN: -td-matmul-strategy-pipeline-depth=3 \ -// RUN: -td-matmul-strategy-use-mma-sync=false \ -// RUN: -td-matmul-strategy-use-fma=true \ -// RUN: | FileCheck %s --check-prefixes=CHECK,OPTIONS - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> -#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> -func.func @batch_matmul_dispatch_0_generic_128x80x320x32_f32() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = 
flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x80x32xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x320xf32> - %5 = tensor.empty() : tensor<128x80x320xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x80x320xf32>) -> tensor<128x80x320xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x80x32xf32>, tensor<128x32x320xf32>) outs(%6 : tensor<128x80x320xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.mulf %in, %in_0 : f32 - %9 = arith.addf %out, %8 : f32 - linalg.yield %9 : f32 - } -> tensor<128x80x320xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : tensor<128x80x320xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK: transform.named_sequence -// CHECK: transform.iree.register_match_callbacks -// CHECK: %[[MATCH:.+]]:2 = transform.iree.match_callback failures(propagate) "batch_matmul" -// CHECK: %[[TILED:.+]], %[[FORALL:.+]] = transform.structured.tile_using_forall %[[MATCH]]#1 -// DEFAULT: tile_sizes [64, 64, 1](mapping = [#gpu.block, #gpu.block, #gpu.block]) -// OPTIONS: tile_sizes [128, 64, 32](mapping = [#gpu.block, #gpu.block, #gpu.block]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: %[[FUSED:.+]], %[[CONTAINING:.+]] = transform.structured.fuse_into_containing_op %[[MATCH]]#0 into %[[FORALL]] -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice %[[FORALL]] -// CHECK: %[[TILED_LINALG:.+]], %[[LOOPS:.+]] = transform.structured.tile_using_for %tiled_op -// DEFAULT: [0, 0, 0, 16] -// OPTIONS: [0, 0, 0, 8] -// CHECK: %[[PADDED:.+]], %{{.*}}, %{{.+}} = transform.structured.pad %tiled_linalg_op pad_to_multiple_of [1, 1, 1, 1] -// CHECK: nofold_flags = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3] -// CHECK: padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// CHECK: %[[V3:.+]] = transform.get_producer_of_operand %[[PADDED]][2] -// CHECK: transform.structured.hoist_pad %{{.*}} by 1 loops -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: %[[FILL:.+]] = transform.structured.match ops{["linalg.fill"]} -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.structured.match ops{["tensor.parallel_insert_slice"]} -// CHECK: transform.structured.insert_slice_to_copy -// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %[[PADDED]][0] -// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %[[PADDED]][1] -// CHECK: %[[RHS_DPS:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS]] - -// CHECK: transform.structured.tile_using_forall %[[LHS]] -// DEFAULT: num_threads [1, 32, 4](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// OPTIONS: num_threads [1, 64, 2](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.structured.match ops{["scf.if"]} -// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch - -// CHECK: transform.structured.tile_using_forall %[[RHS_DPS]] -// DEFAULT: num_threads [8, 16, 1](mapping = [#gpu.thread, #gpu.thread, 
#gpu.thread]) -// OPTIONS: num_threads [2, 8, 8](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// CHECK: transform.structured.tile_using_forall -// DEFAULT: num_threads [2, 64, 1](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// OPTIONS: num_threads [1, 16, 8](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// CHECK: transform.structured.tile_using_forall -// DEFAULT: num_threads [1, 2, 64](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// OPTIONS: num_threads [1, 4, 32](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// CHECK: %tiled_op_8, %forall_op_9 = transform.structured.tile_using_forall %[[FILL]] -// DEFAULT: num_threads [1, 2, 64](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// OPTIONS: num_threads [1, 4, 32](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// CHECK: transform.structured.vectorize -// DEFAULT: vector_sizes [64, 2, 4] -// OPTIONS: vector_sizes [128, 1, 4] -// CHECK: transform.structured.vectorize -// DEFAULT: vector_sizes [32, 1, 1] -// OPTIONS: vector_sizes [128, 4, 4] -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.vector.lower_masked_transfers -// CHECK: transform.structured.vectorize_children_and_apply_patterns -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.canonicalization -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.eliminate_empty_tensors - -// CHECK: transform.iree.bufferize {target_gpu} -// CHECK: transform.memref.erase_dead_alloc_and_stores -// CHECK: transform.iree.forall_to_workgroup -// CHECK: transform.iree.map_nested_forall_to_gpu_threads -// DEFAULT: workgroup_dims = [64, 2, 1] -// OPTIONS: workgroup_dims = [32, 4, 1] -// CHECK: transform.iree.eliminate_gpu_barriers -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.hoist_static_alloc -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.memref.extract_address_computations -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.linalg.tiling_canonicalization -// CHECK: transform.apply_patterns.iree.fold_fill_into_pad -// CHECK: transform.apply_patterns.scf.for_loop_canonicalization -// CHECK: transform.apply_patterns.canonicalization -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.synchronize_loop -// CHECK: transform.structured.hoist_redundant_vector_transfers -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.memref.erase_dead_alloc_and_stores -// CHECK: transform.iree.eliminate_gpu_barriers -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops - -// CHECK: transform.memref.multibuffer 
-// DEFAULT: factor = 2 -// OPTIONS: factor = 3 -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.vector.transfer_to_scf full_unroll = true -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.create_async_groups -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.pipeline_shared_memory_copies -// DEFAULT: depth = 2 -// OPTIONS: depth = 3 -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.vector.lower_masks -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.vector.materialize_masks -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir deleted file mode 100644 index 6ab8221d5351..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir +++ /dev/null @@ -1,130 +0,0 @@ -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit= --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-implicit-gemm-strategy | FileCheck %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @nchw_convolution() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 128, 258, 258], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x128x258x258xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [256, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<256x128x3x3xf32> - %5 = tensor.empty() : tensor<8x256x256x256xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x128x258x258xf32>, tensor<256x128x3x3xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @nchw_convolution - -// CHECK: transform.named_sequence -// CHECK: transform.iree.match_callback failures(propagate) "convolution" -// CHECK: transform.structured.convert_conv2d_to_img2col -// CHECK: transform.get_producer_of_operand %{{.*}}[0] -// CHECK: transform.apply_patterns.iree.bubble_collapse -// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [1, 128, 128](mapping = [#gpu.block, #gpu.block, #gpu.block]) -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice %{{.*}} -// CHECK: 
transform.structured.match ops{["linalg.fill"]} -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.structured.tile_using_for %{{.*}}[0, 0, 0, 16] -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.structured.pad %{{.*}} pad_to_multiple_of [1, 1, 1, 1] {copy_back_op = "none", nofold_flags = [1, 0, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// CHECK: transform.structured.match ops{["linalg.fill"]} -// CHECK: %[[RES:.+]] = transform.get_producer_of_operand %{{.*}}[2] -// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RES]] -// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %{{.*}}[0] -// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %{{.*}}[1] -// CHECK: transform.structured.rewrite_in_destination_passing_style %[[LHS]] -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %[[RHS]] num_threads [1, 4, 32](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface -// CHECK: transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices -// CHECK: transform.apply_patterns.vector.cast_away_vector_leading_one_dim -// CHECK: transform.structured.vectorize_children_and_apply_patterns %{{.*}} {vectorize_nd_extract} -// CHECK: transform.iree.eliminate_empty_tensors -// CHECK: transform.iree.bufferize {target_gpu} -// CHECK: transform.memref.erase_dead_alloc_and_stores -// CHECK: transform.iree.forall_to_workgroup -// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] -// CHECK: transform.iree.hoist_static_alloc %{{.*}} -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK: transform.apply_patterns.memref.extract_address_computations -// CHECK: transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync -// CHECK: transform.structured.hoist_redundant_vector_transfers -// CHECK: transform.iree.vector.vector_to_mma_conversion %{{.*}} {use_mma_sync} - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @nhwc_convolution() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x258x258x128xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 128, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x128x256xf32> - %5 = tensor.empty() : tensor<8x256x256x256xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) 
-> tensor<8x256x256x256xf32> - %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x128xf32>, tensor<3x3x128x256xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @nhwc_convolution - -// CHECK: transform.named_sequence -// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [1, 128, 128](mapping = [#gpu.block, #gpu.block, #gpu.block]) -// CHECK: transform.structured.pad %{{.*}} pad_to_multiple_of [1, 1, 1, 1] {copy_back_op = "none", nofold_flags = [0, 1, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// CHECK: %[[RES:.+]] = transform.get_producer_of_operand %{{.*}}[2] -// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RES]] -// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %{{.*}}[0] -// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %{{.*}}[1] -// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RHS]] -// CHECK: transform.structured.tile_using_forall %[[LHS]] num_threads [1, 32, 4](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @unaligned_convolution() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 132], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x258x258x132xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 132, 264], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x132x264xf32> - %5 = tensor.empty() : tensor<8x256x256x264xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32> - %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x132xf32>, tensor<3x3x132x264xf32>) outs(%6 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 264], strides = [1, 1, 1, 1] : tensor<8x256x256x264xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK: #iree_codegen.translation_info -// CHECK-LABEL: func @unaligned_convolution - -// Currently padding on the img2col op is not supported so bail out for unaligned. 
-// CHECK-NOT: transform.named_sequence diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir deleted file mode 100644 index 8943709e1c13..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir +++ /dev/null @@ -1,522 +0,0 @@ -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul | FileCheck %s - -// Check that setting the command line options affect the transform -// strategy as expected. -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: -td-matmul-strategy-blk-sizes=256,64,1 \ -// RUN: -td-matmul-strategy-reduc-size=8 \ -// RUN: -td-matmul-strategy-num-threads=32,4,1 \ -// RUN: -td-matmul-strategy-num-warps=1,4,1 \ -// RUN: -td-matmul-strategy-use-async-copies=true \ -// RUN: -td-matmul-strategy-use-mma-sync=true \ -// RUN: -td-matmul-strategy-pipeline-depth=5 \ -// RUN: | FileCheck --check-prefix=WITH_OPTIONS %s - -// Check that various more exotic strategies apply properly e2e but without otherwise checking their content. -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul \ -// RUN: -td-matmul-strategy-blk-sizes=16,16,1 \ -// RUN: -td-matmul-strategy-reduc-size=16 \ -// RUN: -td-matmul-strategy-num-threads=32,1,1 \ -// RUN: -td-matmul-strategy-num-warps=1,1,1 \ -// RUN: -td-matmul-strategy-use-async-copies=true \ -// RUN: -td-matmul-strategy-use-mma-sync=true \ -// RUN: -td-matmul-strategy-pipeline-depth=9 \ -// RUN: | FileCheck --check-prefix=WITH_OPTIONS_2 %s - -// Check that various more exotic strategies apply properly e2e but without otherwise checking their content. 
-// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul \ -// RUN: -td-matmul-strategy-blk-sizes=128,64,1 \ -// RUN: -td-matmul-strategy-reduc-size=16 \ -// RUN: -td-matmul-strategy-num-threads=128,2,1 \ -// RUN: -td-matmul-strategy-num-warps=1,8,1 \ -// RUN: -td-matmul-strategy-use-async-copies=true \ -// RUN: -td-matmul-strategy-use-mma-sync=true \ -// RUN: -td-matmul-strategy-pipeline-depth=3 \ -// RUN: | FileCheck --check-prefix=WITH_OPTIONS_3 %s - -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-small-matmul | FileCheck --check-prefix=SMALL %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_1() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf32> - %5 = tensor.empty() : tensor<2052x2052xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul_1 - -// CHECK: transform.named_sequence -// CHECK: transform.iree.match_callback failures(propagate) "matmul" -// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [128, 128](mapping = [#gpu.block, #gpu.block]) -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice -// CHECK: transform.structured.tile_using_for %{{.*}}[0, 0, 16] -// CHECK: transform.structured.pad %{{.*}} pad_to_multiple_of [1, 1, 1] {copy_back_op = "none", nofold_flags = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// CHECK: transform.structured.hoist_pad %{{.}} by 1 loops -// CHECK: transform.structured.insert_slice_to_copy %{{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.scf.take_assumed_branch %{{.*}} 
take_else_branch : (!transform.any_op) -> () -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [4, 4] -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [4, 4] -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [32, 4] -// CHECK: transform.apply_patterns.vector.lower_masked_transfers -// CHECK: transform.structured.vectorize_children_and_apply_patterns %{{.*}} -// CHECK: transform.iree.eliminate_empty_tensors %{{.*}} -// CHECK: transform.iree.bufferize {target_gpu} %{{.*}} -// CHECK: transform.iree.forall_to_workgroup %{{.*}} -// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] -// CHECK: transform.iree.hoist_static_alloc %{{.*}} -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK: } : !transform.any_op -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.memref.extract_address_computations -// CHECK: } : !transform.any_op -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync -// CHECK: } : !transform.any_op -// CHECK: transform.structured.match ops{["scf.for"]} in %{{.*}} -// CHECK: transform.iree.synchronize_loop %{{.*}} -// CHECK: transform.structured.hoist_redundant_vector_transfers %{{.*}} -// CHECK: transform.memref.erase_dead_alloc_and_stores %{{.*}} -// CHECK: transform.iree.vector.vector_to_mma_conversion %{{.*}} {use_mma_sync} -// CHECK: transform.iree.eliminate_gpu_barriers -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK: } : !transform.any_op -// CHECK: transform.memref.multibuffer %{{.*}} {factor = 3 : i64, skip_analysis} -// CHECK: transform.apply_patterns.vector.transfer_to_scf full_unroll = true -// CHECK: transform.iree.create_async_groups %{{.*}} {use_mma_sync} -// CHECK: transform.iree.pipeline_shared_memory_copies %{{.*}} {depth = 3 : i64, use_mma_sync} -// CHECK: transform.apply_patterns.vector.lower_masks -// CHECK: transform.apply_patterns.vector.materialize_masks -// CHECK: apply_patterns to %{{.*}} { -// CHECK-DAG: transform.apply_patterns.linalg.tiling_canonicalization -// CHECK-DAG: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK-DAG: transform.apply_patterns.canonicalization -// CHECK: } : !transform.any_op -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// WITH_OPTIONS-LABEL: func @matmul_1 - -// WITH_OPTIONS: transform.named_sequence -// WITH_OPTIONS: transform.iree.match_callback failures(propagate) "matmul" -// Tile sizes are set by td-matmul-strategy-blk-size-XX. -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} tile_sizes [256, 64](mapping = [#gpu.block, #gpu.block]) -// WITH_OPTIONS: transform.structured.fuse_into_containing_op -// WITH_OPTIONS: transform.iree.populate_workgroup_count_region_using_num_threads_slice -// The tiling is affected by td-matmul-strategy-reduc-size: 8. 
-// WITH_OPTIONS: transform.structured.tile_using_for %{{.*}}[0, 0, 8] -// WITH_OPTIONS: transform.structured.pad %{{.*}} pad_to_multiple_of [1, 1, 1] {copy_back_op = "none", nofold_flags = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// WITH_OPTIONS: transform.structured.hoist_pad %{{.}} by 1 loops -// WITH_OPTIONS: transform.structured.insert_slice_to_copy %{{.*}} : (!transform.any_op) -> !transform.any_op -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [64, 2](mapping = [#gpu.thread, #gpu.thread]) -// WITH_OPTIONS: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) -// WITH_OPTIONS: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [4, 1](mapping = [#gpu.warp, #gpu.warp]) -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [4, 1](mapping = [#gpu.warp, #gpu.warp]) -// WITH_OPTIONS: transform.structured.vectorize %{{.*}} vector_sizes [4, 4] -// WITH_OPTIONS: transform.structured.vectorize %{{.*}} vector_sizes [1, 4] -// WITH_OPTIONS: transform.structured.vectorize %{{.*}} vector_sizes [32, 4] -// WITH_OPTIONS: transform.apply_patterns.vector.lower_masked_transfers -// WITH_OPTIONS: transform.structured.vectorize_children_and_apply_patterns %{{.*}} -// WITH_OPTIONS: transform.iree.eliminate_empty_tensors %{{.*}} -// WITH_OPTIONS: transform.iree.bufferize {target_gpu} %{{.*}} -// WITH_OPTIONS: transform.iree.forall_to_workgroup %{{.*}} -// The workgroup dimensions are controled by td-matmul-strategy-num-threads-XX. -// The warp dimensions are controled by td-matmul-strategy-num-warps-XX. -// WITH_OPTIONS: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [32, 4, 1] -// WITH_OPTIONS: transform.iree.hoist_static_alloc %{{.*}} -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.memref.fold_memref_alias_ops -// WITH_OPTIONS: } : !transform.any_op -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.memref.extract_address_computations -// WITH_OPTIONS: } : !transform.any_op -// The unroll attribute should match td-matmul-use-mma-sync, for true: mma_sync, -// for false:_wmma. -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync -// WITH_OPTIONS: } -// WITH_OPTIONS: transform.structured.match ops{["scf.for"]} in %{{.*}} -// WITH_OPTIONS: transform.iree.synchronize_loop %{{.*}} -// WITH_OPTIONS: transform.structured.hoist_redundant_vector_transfers %{{.*}} -// WITH_OPTIONS: transform.memref.erase_dead_alloc_and_stores %{{.*}} -// The attribute should match td-matmul-use-mma-sync. -// WITH_OPTIONS: transform.iree.vector.vector_to_mma_conversion %{{.*}} {use_mma_sync} -// WITH_OPTIONS: transform.iree.eliminate_gpu_barriers -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.memref.fold_memref_alias_ops -// WITH_OPTIONS: } : !transform.any_op -// The multibuffer pass is only run when we set use-async-copies. -// The factor should match td-matmul-strategy-pipeline-depth: 5. 
-// WITH_OPTIONS: transform.memref.multibuffer %{{.*}} {factor = 5 : i64, skip_analysis} -// WITH_OPTIONS: transform.apply_patterns.vector.transfer_to_scf full_unroll = true -// The attribute should match td-matmul-use-mma-sync. -// WITH_OPTIONS: transform.iree.create_async_groups %{{.*}} {use_mma_sync} -// The depth should match td-matmul-strategy-pipeline-depth: 5. -// WITH_OPTIONS: transform.iree.pipeline_shared_memory_copies %{{.*}} {depth = 5 : i64, use_mma_sync} -// WITH_OPTIONS: transform.apply_patterns.vector.lower_masks -// WITH_OPTIONS: transform.apply_patterns.vector.materialize_masks -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.linalg.tiling_canonicalization -// WITH_OPTIONS: transform.apply_patterns.memref.fold_memref_alias_ops -// WITH_OPTIONS: } : !transform.any_op -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.canonicalization -// WITH_OPTIONS } -// WITH_OPTIONS: transform.iree.apply_licm -// WITH_OPTIONS: transform.apply_cse to - - -// WITH_OPTIONS_2-LABEL: func @matmul_1 - -// WITH_OPTIONS_3-LABEL: func @matmul_1 - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_2() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2051, 2555], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2051x2555xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2555, 2051], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2555x2050xf32> - %5 = tensor.empty() : tensor<2051x2050xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2051x2555xf32>, tensor<2555x2050xf32>) outs(%6 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2051, 2050], strides = [1, 1] : tensor<2051x2050xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul_2 - -// CHECK: transform.named_sequence -// CHECK: transform.iree.match_callback failures(propagate) "matmul" -// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [128, 128](mapping = [#gpu.block, #gpu.block]) -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice -// CHECK: transform.structured.tile_using_for %{{.*}}[0, 0, 16] -// align1 -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) -// align2 -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 64](mapping = [#gpu.thread, #gpu.thread]) -// align2 -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 64](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// align1 -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [16, 
1] -// align2 -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [8, 2] -// align2 -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [64, 2] - -// WITH_OPTIONS_2-LABEL: func @matmul_2 - -// WITH_OPTIONS_3-LABEL: func @matmul_2 - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_3() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2556xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2556xf32> - %5 = tensor.empty() : tensor<2048x2556xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2048x2556xf32>, tensor<2556x2556xf32>) outs(%6 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : tensor<2048x2556xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul_3 - -// CHECK: transform.named_sequence - -// WITH_OPTIONS_2-LABEL: func @matmul_3 - -// WITH_OPTIONS_3-LABEL: func @matmul_3 - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_4_partially_unaligned() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2044xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2044x1024xf32> - %5 = tensor.empty() : tensor<2048x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2048x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : tensor<2048x1024xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul_4_partially_unaligned - -// CHECK: transform.structured.tile_using_for %tiled_op tile_sizes [0, 0, 16] - -// Make sure we do not canonicalize because the result is still aligned. 
-// CHECK-NEXT: transform.structured.pad %tiled_linalg_op -// CHECK-SAME: copy_back_op = "none" -// CHECK-SAME: nofold_flags = [1, 1, 1] -// CHECK-SAME: padding_dimensions = [0, 1, 2] -// CHECK-SAME: padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32] -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: %[[RES_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[2] -// CHECK: %[[RES_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RES_PAD]] -// CHECK: %[[LHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[0] -// CHECK: %[[RHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[1] -// CHECK: %[[TILED_LHS:.+]], %{{.*}} = transform.structured.tile_using_forall %[[LHS_PAD]] num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.match ops{["scf.if"]} -// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch -// CHECK: %[[TILED_RHS:.+]], %{{.*}} = transform.structured.tile_using_forall %[[RHS_PAD]] num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.match ops{["scf.if"]} -// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// alignLhs -// CHECK: transform.structured.vectorize %[[TILED_LHS]] vector_sizes [4, 4] -// alignRhs -// CHECK: transform.structured.vectorize %[[TILED_RHS]] vector_sizes [4, 4] - -// CHECK: transform.apply_patterns.vector.lower_masks -// CHECK: transform.apply_patterns.vector.materialize_masks - -// WITH_OPTIONS_2-LABEL: func @matmul_4_partially_unaligned - -// WITH_OPTIONS_3-LABEL: func @matmul_4_partially_unaligned - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @aligned_matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2048xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2048xf32> - %5 = tensor.empty() : tensor<2048x2048xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2048x2048xf32>, tensor<2048x2048xf32>) outs(%6 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @aligned_matmul - -// Block level is the same for aligned. 
-// CHECK: transform.structured.tile_using_for %tiled_op tile_sizes [0, 0, 16] - -// Make sure we do not canonicalize if the result is aligned to avoid folding the extract_slice on the iterator. -// CHECK-NEXT: transform.structured.pad %tiled_linalg_op -// CHECK-SAME: copy_back_op = "none" -// CHECK-SAME: nofold_flags = [1, 1, 1] -// CHECK-SAME: padding_dimensions = [0, 1, 2] -// CHECK-SAME: padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32] - -// Canonicalization is currently required here to enable pad to dps to produce linalg.copy ops. -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: %[[RES_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[2] -// CHECK: %[[RES_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RES_PAD]] -// CHECK: %[[LHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[0] -// CHECK: %[[RHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[1] -// CHECK: %[[LHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[LHS_PAD]] -// CHECK: %[[RHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS_PAD]] -// CHECK: transform.structured.tile_using_forall %[[LHS_COPY]] num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %[[RHS_COPY]] num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// Verify we don't go down the path without the flag. 
-// WITH_OPTIONS-LABEL: func @aligned_matmul - -// WITH_OPTIONS-NOT: transform.sequence -// WITH_OPTIONS-NOT: transform.named_sequence - -// WITH_OPTIONS_2-LABEL: func @aligned_matmul - -// WITH_OPTIONS_3-LABEL: func @aligned_matmul - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_5_small() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 2044], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x2044xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2044, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2044x1024xf32> - %5 = tensor.empty() : tensor<2x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x1024xf32>) -> tensor<2x1024xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2x1024xf32>) -> tensor<2x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK: iree_codegen.translation_info -// CHECK-LABEL: func @matmul_5_small - -// This matmul is considered "too small"/"degenerate" for a tensor core strategy, -// just fallback to the vectorized strategy. - -// WITH_OPTIONS_2-LABEL: func @matmul_5_small - -// WITH_OPTIONS_3-LABEL: func @matmul_5_small - -// SMALL-LABEL: func @matmul_5_small -// SMALL: transform.named_sequence -// SMALL-NOT: mma -// SMALL-NOT: wmma - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @f16_matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf16> - %5 = tensor.empty() : tensor<2052x2052xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16> - %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf16>, tensor<2556x2052xf16>) outs(%6 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf16> -> !flow.dispatch.tensor> - return -} - -// CHECK: iree_codegen.translation_info -// CHECK-LABEL: func @f16_matmul -// CHECK-NOT: transform.sequence -// CHECK-NOT: transform.named_sequence - -// WITH_OPTIONS_2-LABEL: func @f16_matmul - -// WITH_OPTIONS_3-LABEL: func @f16_matmul - -// 
----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @int8_matmul() { - %c0 = arith.constant 0 : index - %c0_i8 = arith.constant 0 : i8 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x2556xi8> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xi8> - %5 = tensor.empty() : tensor<4x2052xi8> - %6 = linalg.fill ins(%c0_i8 : i8) outs(%5 : tensor<4x2052xi8>) -> tensor<4x2052xi8> - %7 = linalg.matmul ins(%3, %4 : tensor<4x2556xi8>, tensor<2556x2052xi8>) outs(%6 : tensor<4x2052xi8>) -> tensor<4x2052xi8> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4, 2052], strides = [1, 1] : tensor<4x2052xi8> -> !flow.dispatch.tensor> - return -} - -// SMALL-LABEL: func @int8_matmul -// SMALL: transform.named_sequence -// SMALL-NOT: mma -// SMALL-NOT: wmma - -// CHECK-LABEL: func @int8_matmul -// CHECK-NOT: transform.sequence -// CHECK-NOT: transform.named_sequence - -// WITH_OPTIONS-LABEL: func @int8_matmul -// WITH_OPTIONS-NOT: transform.sequence -// WITH_OPTIONS-NOT: transform.named_sequence - -// WITH_OPTIONS_2-LABEL: func @int8_matmul -// WITH_OPTIONS_2-NOT: transform.sequence -// WITH_OPTIONS_2-NOT: transform.named_sequence - -// WITH_OPTIONS_3-LABEL: func @int8_matmul -// WITH_OPTIONS_3-NOT: transform.sequence -// WITH_OPTIONS_3-NOT: transform.named_sequence diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir deleted file mode 100644 index 599ea923d988..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir +++ /dev/null @@ -1,150 +0,0 @@ -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true \ -// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-pad-strategy \ -// RUN: | FileCheck %s - -// Check that setting the command line options affect the transform -// strategy as expected. 
-// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true \ -// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-pad-strategy \ -// RUN: --td-pad-strategy-blk-sizes=16,32,1 \ -// RUN: --td-pad-strategy-num-threads=8,4,1 \ -// RUN: --td-pad-strategy-vector-size=2,4 \ -// RUN: --td-pad-strategy-use-async-copies=false \ -// RUN: | FileCheck --check-prefix=WITH_OPTIONS %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -func.func @pad() { - %c0 = arith.constant 0 : index - %c56 = arith.constant 56 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> - %cst_0 = arith.constant 0.000000e+00 : f32 - %padded = tensor.pad %2 low[%c0, 0] high[5, %c56] { - ^bb0(%arg0: index, %arg1: index): - tensor.yield %cst_0 : f32 - } : tensor<123x456xf32> to tensor<128x512xf32> - flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @pad -// CHECK: transform.named_sequence -// CHECK: transform.iree.register_match_callbacks -// CHECK: {{.*}} = transform.iree.match_callback failures(propagate) "pad"({{.*}}) : (!transform.any_op) -> !transform.any_op -// CHECK: transform.structured.tile_using_forall {{.*}} tile_sizes [64, 64](mapping = [#gpu.block, #gpu.block]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: {{.*}} = transform.structured.match ops{["scf.if"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.scf.take_assumed_branch {{.*}} take_else_branch : (!transform.any_op) -> () -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice {{.*}} : (!transform.any_op) -> () -// CHECK: {{.*}} = transform.structured.tile_using_forall {{.*}} num_threads [16, 16](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: {{.*}} = transform.structured.match ops{["scf.if"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.scf.take_assumed_branch {{.*}} take_else_branch : (!transform.any_op) -> () -// CHECK: transform.structured.vectorize {{.*}} vector_sizes [4, 4] : !transform.any_op -// CHECK: {{.*}} = transform.structured.match ops{["func.func"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.apply_patterns.vector.lower_masked_transfers -// CHECK: apply_patterns to %{{.*}} { -// CHECK-DAG: transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface -// CHECK-DAG: transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices -// CHECK-DAG: 
transform.apply_patterns.vector.cast_away_vector_leading_one_dim -// CHECK: } : !transform.any_op -// CHECK: {{.*}} = transform.structured.vectorize_children_and_apply_patterns {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.eliminate_empty_tensors {{.*}} : (!transform.any_op) -> () -// CHECK: {{.*}} = transform.iree.bufferize {target_gpu} {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: {{.*}} = transform.structured.match ops{["func.func"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.memref.erase_dead_alloc_and_stores {{.*}} : (!transform.any_op) -> () -// CHECK: {{.*}} = transform.structured.match ops{["func.func"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.iree.forall_to_workgroup {{.*}} : (!transform.any_op) -> () -// CHECK: transform.iree.map_nested_forall_to_gpu_threads {{.*}} workgroup_dims = [16, 16, 1] {{.*}}: (!transform.any_op) -> () -// CHECK: transform.apply_patterns.vector.lower_masks -// CHECK: transform.apply_patterns.vector.materialize_masks -// CHECK: apply_patterns to %{{.*}} { -// CHECK-DAG: transform.apply_patterns.linalg.tiling_canonicalization -// CHECK-DAG: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK-DAG: transform.apply_patterns.canonicalization -// CHECK: } : !transform.any_op -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// WITH_OPTIONS-LABEL: func @pad -// WITH_OPTIONS: transform.structured.tile_using_forall {{.*}} tile_sizes [32, 16](mapping = [#gpu.block, #gpu.block]) -// WITH_OPTIONS: {{.*}} = transform.structured.tile_using_forall {{.*}} num_threads [4, 8](mapping = [#gpu.thread, #gpu.thread]) -// WITH_OPTIONS: transform.structured.vectorize {{.*}} vector_sizes [2, 4] : !transform.any_op -// WITH_OPTIONS: transform.iree.map_nested_forall_to_gpu_threads {{.*}} workgroup_dims = [8, 4, 1] - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -func.func @pad_low() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> - %cst_0 = arith.constant 0.000000e+00 : f32 - %padded = tensor.pad %2 low[5, 0] high[0, 56] { - ^bb0(%arg0: index, %arg1: index): - tensor.yield %cst_0 : f32 - } : tensor<123x456xf32> to tensor<128x512xf32> - flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> - return -} - -// The strategy doesn't apply for low padding. 
-// CHECK-LABEL: @pad_low -// CHECK-NOT: transform.iree -// WITH_OPTIONS-LABEL: @pad_low -// WITH_OPTIONS-NOT: transform.iree - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -func.func @pad_local() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> - %padded = tensor.pad %2 low[0, 0] high[5, 56] { - ^bb0(%arg0: index, %arg1: index): - %3 = arith.index_cast %arg0 : index to i64 - %4 = arith.uitofp %3 : i64 to f32 - tensor.yield %4 : f32 - } : tensor<123x456xf32> to tensor<128x512xf32> - flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> - return -} - -// The strategy doesn't apply for local pad values. -// CHECK-LABEL: @pad_local -// CHECK-NOT: transform.iree -// WITH_OPTIONS-LABEL: @pad_local -// WITH_OPTIONS-NOT: transform.iree diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel b/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel index ccd545988581..d264a26551f9 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel @@ -92,7 +92,6 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect", "//compiler/src/iree/compiler/Codegen/Interfaces:PartitionableLoopsInterface", - "//compiler/src/iree/compiler/Codegen/TransformStrategies/GPU", "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "//compiler/src/iree/compiler/Dialect/Flow/IR", diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt index 7f4ccd972613..08ec5885dc97 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt @@ -141,7 +141,6 @@ iree_cc_library( iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect iree::compiler::Codegen::Interfaces::PartitionableLoopsInterface - iree::compiler::Codegen::TransformStrategies::GPU iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils iree::compiler::Dialect::Flow::IR diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp index 568365965cff..16a1acf4316f 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp @@ -10,7 +10,6 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" #include "iree/compiler/Codegen/Utils/Utils.h" @@ -45,11 +44,6 @@ constexpr int kMaxVectorNumBits = 128; namespace 
mlir::iree_compiler { -llvm::cl::opt clSPIRVEnableTransformDialectJit( - "iree-spirv-enable-transform-dialect-jit", - llvm::cl::desc("Enable the usage of the transform dialect JIT"), - llvm::cl::init(false)); - using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline; //===----------------------------------------------------------------------===// @@ -1490,47 +1484,6 @@ static LogicalResult setDefaultOpConfig(IREE::GPU::TargetAttr target, workgroupSize); } -//===----------------------------------------------------------------------===// -// Transform Dialect Specialized Configurations -//===----------------------------------------------------------------------===// - -static LogicalResult -setTransformDialectConfig(mlir::FunctionOpInterface entryPoint, Operation *op, - IREE::GPU::TargetAttr target) { - if (!clSPIRVEnableTransformDialectJit) { - return failure(); - } - - MLIRContext *context = entryPoint.getContext(); - auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( - context, CodeGenPipeline::TransformDialectCodegen); - - // TODO: unify the target information into one structure. - iree_compiler::gpu::GPUModel gpuModel; - gpuModel.hasWarpShuffle = target.supportsSubgroupShuffle(); - gpuModel.hasTF32TensorCore = false; - gpuModel.hasMmaSync = false; - gpuModel.hasTF32TensorCore = false; - gpuModel.minSubgroupSize = target.getMinSubgroupSize(); - gpuModel.maxSubgroupSize = target.getMaxSubgroupSize(); - gpuModel.maxWorkGroupInvocations = - target.getWgp().getMaxThreadCountPerWorkgroup(); - - // Populates the supported WMMA fragment combinations from the target - // environment. Infer tf32 support from the list of supported fragment types. - for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { - auto [mSize, nSize, kSize] = mma.getMNKShape(); - auto [aType, bType, cType] = mma.getABCElementTypes(); - gpuModel.supportedWMMAConfigs.emplace_back(iree_compiler::gpu::MMAConfig{ - mSize, nSize, kSize, aType, bType, cType}); - } - - if (failed(iree_compiler::gpu::matchAndSetTransformStrategy(entryPoint, op, - gpuModel))) - return failure(); - return setTranslationInfo(entryPoint, translationInfo); -} - //===----------------------------------------------------------------------===// // Configuration Dispatcher //===----------------------------------------------------------------------===// @@ -1540,11 +1493,6 @@ setTransformDialectConfig(mlir::FunctionOpInterface entryPoint, Operation *op, static LogicalResult setSPIRVOpConfig(IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPointFn, Operation *rootOp) { - // First try to see if there is a matching transform dialect configuration. - if (succeeded(setTransformDialectConfig(entryPointFn, rootOp, target))) { - return success(); - } - // First try to find a proper CodeGen configuration to tile and vectorize for // the current target architecture. 
if (target.isAMD() && succeeded(detail::setAMDCodeGenConfig(target, rootOp))) diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp index ef68e53ffe4f..219ec01dd8b6 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp @@ -9,16 +9,7 @@ #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" #include "iree/compiler/Codegen/SPIRV/KernelConfig.h" #include "iree/compiler/Codegen/SPIRV/Passes.h" -#include "iree/compiler/Dialect/HAL/IR/HALDialect.h" -#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" @@ -42,15 +33,8 @@ class SPIRVSelectLoweringStrategyPass final SPIRVSelectLoweringStrategyPass>::SPIRVSelectLoweringStrategyPassBase; void getDependentDialects(DialectRegistry ®istry) const override { - // TODO(qedawkins): Once TransformStrategies is deprecated, drop the - // unnecessary dialect registrations. - registry - .insert(); + registry.insert(); } void runOnOperation() override; diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel index 3886c6e20938..e4807d9310df 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel @@ -59,7 +59,6 @@ iree_lit_test_suite( "pipeline_reduction_subgroup.mlir", "pipeline_sub_byte_dequant.mlir", "physical_storage_buffer_addresses.mlir", - "set_transform_strategy.mlir", "tile_and_distribute.mlir", "tile_and_distribute_scatter.mlir", "tile_and_distribute_sort.mlir", diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt index 078f92ab37b3..f28a588339ca 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt @@ -55,7 +55,6 @@ iree_lit_test_suite( "pipeline_matvec.mlir" "pipeline_reduction_subgroup.mlir" "pipeline_sub_byte_dequant.mlir" - "set_transform_strategy.mlir" "tile_and_distribute.mlir" "tile_and_distribute_scatter.mlir" "tile_and_distribute_sort.mlir" diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir deleted file mode 100644 index d32855d538b7..000000000000 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir +++ /dev/null @@ -1,44 +0,0 @@ -// RUN: iree-opt %s --split-input-file --iree-gpu-test-target=volta@vulkan \ -// RUN: --pass-pipeline="builtin.module(iree-spirv-select-lowering-strategy-pass)"\ -// RUN: --iree-spirv-enable-transform-dialect-jit=true - -// TODO: Transform script based CodeGen expects fp32-input to target tensor -// core, but there are no such wmma intrinsics. Fix it to support fp16-input. 
-// TODO: | FileCheck %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf32> - %5 = tensor.empty() : tensor<2052x2052xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul - -// CHECK: transform.named_sequence - -/// The specific vector sizes are tested in the LLVMGPU tests and thus omitted -/// here. This is just to check that masked vectorization is used. -// CHECK-COUNT-3: transform.structured.vectorize - -// Verify use of WMMA. -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync -// CHECK: } : !transform.any_op -// CHECK: transform.iree.vector.vector_to_mma_conversion %{{.*}} {use_wmma} - -// Verify asynchronous copy is not used. -// CHECK-NOT: transform.iree.create_async_groups diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/BUILD.bazel deleted file mode 100644 index 236a47446725..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/BUILD.bazel +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright 2020 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/CMakeLists.txt deleted file mode 100644 index d74a77855614..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# compiler/src/iree/compiler/Codegen/TransformStrategies/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
# -################################################################################ - -iree_add_all_subdirs() - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/BUILD.bazel deleted file mode 100644 index bf6645762b0d..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/BUILD.bazel +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2023 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:build_defs.oss.bzl", "iree_compiler_cc_library") - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_compiler_cc_library( - name = "CPU", - srcs = [ - "Common.cpp", - "ReductionStrategy.cpp", - ], - hdrs = [ - "Common.h", - "ReductionStrategy.h", - ], - deps = [ - # Dialects - "//compiler/src/iree/compiler/Dialect/Flow/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/TransformExtensions:LinalgExtExtensions", - "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineUtils", - "@llvm-project//mlir:AsyncDialect", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:ArithUtils", - "@llvm-project//mlir:BufferizationDialect", - "@llvm-project//mlir:BufferizationTransforms", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:FunctionInterfaces", - "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:LinalgDialect", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:PDLDialect", - "@llvm-project//mlir:PDLInterpDialect", - "@llvm-project//mlir:SCFDialect", - "@llvm-project//mlir:SCFUtils", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformDialect", - "@llvm-project//mlir:VectorDialect", - # IR - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Rewrite", - # Interfaces - # Transforms (needed mostly for the BufferizableOpInterfaceImpl) - "@llvm-project//mlir:ArithTransforms", - "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:SCFTransforms", - "@llvm-project//mlir:TensorTransforms", - "@llvm-project//mlir:VectorTransforms", - # Other Stuff - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:DialectUtils", - # TransformStrategies - "//compiler/src/iree/compiler/Codegen/TransformStrategies/Common:TransformStrategies", - # TransformExtensions - "//compiler/src/iree/compiler/Codegen/Common/TransformExtensions:CommonExtensions", - "//compiler/src/iree/compiler/Codegen/LLVMCPU/TransformExtensions:LLVMCPUExtensions", - "@llvm-project//mlir:LinalgTransformOps", - "@llvm-project//mlir:VectorTransformOps", - # TransformMatchers and other stuff - "//llvm-external-projects/iree-dialects:IREEDialectsTransforms", - ], -) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/CMakeLists.txt deleted file mode 100644 index 06ac540e9d91..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ 
-################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. # -################################################################################ - -iree_add_all_subdirs() - -iree_cc_library( - NAME - CPU - HDRS - "Common.h" - "ReductionStrategy.h" - SRCS - "Common.cpp" - "ReductionStrategy.cpp" - DEPS - IREEDialectsTransforms - IREELinalgTransformDialect - LLVMSupport - MLIRAffineDialect - MLIRAffineUtils - MLIRAnalysis - MLIRArithDialect - MLIRArithTransforms - MLIRArithUtils - MLIRAsyncDialect - MLIRBufferizationDialect - MLIRBufferizationTransforms - MLIRFuncDialect - MLIRFunctionInterfaces - MLIRGPUDialect - MLIRIR - MLIRLLVMDialect - MLIRLinalgDialect - MLIRLinalgTransformOps - MLIRLinalgTransforms - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRParser - MLIRPass - MLIRRewrite - MLIRSCFDialect - MLIRSCFTransforms - MLIRSCFUtils - MLIRSupport - MLIRTensorDialect - MLIRTensorTransforms - MLIRTransformDialect - MLIRVectorDialect - MLIRVectorTransformOps - MLIRVectorTransforms - iree::compiler::Codegen::Common::TransformExtensions::CommonExtensions - iree::compiler::Codegen::LLVMCPU::TransformExtensions::LLVMCPUExtensions - iree::compiler::Codegen::TransformStrategies::Common::TransformStrategies - iree::compiler::Dialect::Flow::IR - iree::compiler::Dialect::LinalgExt::IR - iree::compiler::Dialect::LinalgExt::TransformExtensions::LinalgExtExtensions - PUBLIC -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.cpp deleted file mode 100644 index 0b3d1fb33294..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/CPU/Common.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMCPU/TransformExtensions/LLVMCPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Dialect/Flow/IR/FlowOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. -using iree_compiler::cpu::CPUModel; -using iree_compiler::cpu::ReductionConfig; -using iree_compiler::cpu::ReductionStrategy; -using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using transform::ApplyLowerContractionPatternsOp; -using transform::ApplyLowerMultiReductionPatternsOp; -using transform::ApplyLowerShapeCastPatternsOp; -using transform::ApplyLowerTransferPatternsOp; -using transform::ApplyLowerTransposePatternsOp; -using transform::ApplySplitTransferFullPartialPatternsOp; -using transform::ApplyTransferPermutationPatternsOp; -using transform::ApplyTransferToScfPatternsOp; -using transform::MatchOp; -using transform::SplitHandleOp; -using transform_ext::AllDims; -using transform_ext::m_StructuredOp; -using transform_ext::NumEqualsTo; -using transform_ext::RegisterMatchCallbacksOp; -using transform_ext::ShapeKind; -using transform_ext::StructuredOpMatcher; -using vector::VectorContractLoweringAttr; - -//===----------------------------------------------------------------------===// -// Mid-level problem-specific strategy builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// - -// TODO: better builders. 
-static Value buildDefaultVectorLoweringStrategy( - ImplicitLocOpBuilder &b, Value funcH, - const vector::LowerVectorsOptions &lowerVectorsOpts) { - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, lowerVectorsOpts.vectorContractLowering); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, lowerVectorsOpts.vectorMultiReductionLowering); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, lowerVectorsOpts.vectorTransferSplit); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, /*maxTransferRank=*/1, - /*fullUnroll=*/lowerVectorsOpts.unrollVectorTransfers); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create(loc, - /*maxTransferRank=*/1); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, /*loweringStrategy=*/lowerVectorsOpts.vectorTransposeLowering, - /*avx2LoweringStrategy=*/lowerVectorsOpts.transposeAVX2Lowering); - }); - return funcH; -} - -/// Take care of the last common steps in a CPU strategy (i.e. vectorize, -/// bufferize and map to blocks). -/// Return the handles to the updated variant and the function ops under -/// the variant op. -std::pair mlir::iree_compiler::cpu::buildCommonTrailingStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const vector::LowerVectorsOptions &lowerVectorsOpts) { - Value funcH = b.create(variantH, func::FuncOp::getOperationName()); - - // Step N-5. Fold tensor.empty to avoid large allocations. - // Step N-4. Perform a pass of canonicalization + enabling after tiling. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - funcH = iree_compiler::buildVectorize(b, funcH); - - // Step N-3. Perform a pass of canonicalization + enabling after vectorization - // as well as hoisting subset operations such as vector.transfer_read/write. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - iree_compiler::buildHoisting(b, funcH); - - // Step N-2. Bufferize and drop HAL descriptor from memref ops. - variantH = iree_compiler::buildBufferize(b, variantH); - - // Step N-1. Post-bufferization mapping to blocks only. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single function to transform, may need hardening. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - b.create(funcH); - - // Step N. Lower vectors. - funcH = buildDefaultVectorLoweringStrategy(b, funcH, lowerVectorsOpts); - return std::make_pair(variantH, funcH); -} - -//===----------------------------------------------------------------------===// -// Higher-level problem-specific strategy creation APIs, these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// - -static ReductionConfig -getReductionConfig(const transform_ext::MatchedReductionCaptures &captures, - const CPUModel &cpuModel) { - return ReductionConfig{16}; -} - -LogicalResult iree_compiler::cpu::matchAndSetReductionStrategy( - mlir::FunctionOpInterface entryPoint, linalg::LinalgOp op, - const CPUModel &cpuModel) { - // 1. Match a reduction and surrounding ops. 
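  // What "match a reduction and surrounding ops" means here (see the matcher
  // construction just below): the matcher looks for an optional leading
  // elementwise op, a fill, a 1-D most-minor reduction, and an optional
  // trailing elementwise op, and with mustMatchEntireFunc set it requires the
  // match to cover the whole dispatch function. If the match fails,
  // matchAndSetReductionStrategy returns failure and no strategy is attached.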
- StructuredOpMatcher *reduction; - transform_ext::MatchedReductionCaptures captures; - transform_ext::MatcherContext matcherContext; - makeReductionMatcher(matcherContext, reduction, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *reduction)) - return failure(); - - // 2. Construct the configuration and the strategy builder. - // TODO: Generalize along the HW axis. - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - ReductionConfig reductionConfig = getReductionConfig(captures, cpuModel); - ReductionStrategy strategy(captures, reductionConfig); - return buildReductionStrategy(b, variant, strategy); - }; - - // 3. Build strategy embedded into the IR. - createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.h deleted file mode 100644 index c663ad87b3f8..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_COMMON_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_COMMON_H_ - -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/Interfaces/FunctionInterfaces.h" - -namespace mlir::iree_compiler::cpu { - -//===----------------------------------------------------------------------===// -// Mid-level problem-specific strategy builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -/// Take care of the last common steps in a CPU strategy (i.e. vectorize, -/// bufferize, maps to blocks/workgroups and lower vectors). -/// Return the handles to the updated variant and the function ops under -/// the variant op. -// TODO: pass control to LowerVectorsOp once the builder allows it. -std::pair buildCommonTrailingStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const vector::LowerVectorsOptions &lowerVectorsOpts); - -//===----------------------------------------------------------------------===// -// Higher-level problem-specific strategy creation APIs, these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// -/// Placeholder for some hardware model proxy that contains relevant information -/// to configure the reduction strategy. In the future, this will need to be -/// driven by some contract with the runtime. -struct CPUModel { - static constexpr StringLiteral kDefaultCPU = "DefaultCPU"; - StringRef model = kDefaultCPU; -}; - -/// Map an N-D parallel, 1-D reduction operation with optional leading and -/// optional trailing elementwise operations. -/// The 1-D reduction dimension must be in the most minor dimension. -/// The innermost dimensions of the leading and trailing operations must be most -/// minor along all accesses. -/// Return failure if matching fails. -/// On a successful match, configure a reduction strategy based on a proxy model -/// of the hardware and construct transform dialect IR that implements the -/// reduction strategy. 
The transform dialect IR is added in a top-level -/// ModuleOp after the `entryPoint` function. -LogicalResult matchAndSetReductionStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, - const CPUModel &cpuModel); - -} // namespace mlir::iree_compiler::cpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_COMMON_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.cpp deleted file mode 100644 index f5998663f2fd..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.cpp +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMCPU/TransformExtensions/LLVMCPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/CPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Dialect/Flow/IR/FlowOps.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. -using iree_compiler::cpu::ReductionConfig; -using iree_compiler::cpu::ReductionStrategy; -using transform_ext::RegisterMatchCallbacksOp; - -mlir::iree_compiler::cpu::ReductionStrategy::ReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig) - : AbstractReductionStrategy(captures, {}) { - configure(reductionConfig); - LLVM_DEBUG(DBGS() << "use CPU reduction strategy\n"); -} - -void mlir::iree_compiler::cpu::ReductionStrategy::configure( - const ReductionConfig &config) { - // Block-level - // =========== - // Tile all the parallel dimensions to 8 for now. - int64_t numParallelLoops = captures.reductionRank - 1; - workgroupTileSizes.append(numParallelLoops, 8); - vectorSize = config.vectorSize; -} - -/// Builds the transform IR tiling reductions for CUDA targets. Supports -/// reductions in the last dimension, with optional leading and trailing -/// elementwise operations. -void mlir::iree_compiler::cpu::buildReductionStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const ReductionStrategy &strategy) { - // Step 1. Tiling to the block/workgroup level. Keep everything fused. - auto [maybeLeadingHBlock, gridFillH, gridReductionH, maybeTiledTrailingHBlock, - forall] = - buildReductionStrategyBlockDistribution(b, variantH, - strategy.workgroupTileSizes); - - // Step 2. Naive first strategy to tile the most minor dimension by - // strategy.getVectorSize(). 
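  // A worked instance of this step, assuming the defaults visible above
  // (getReductionConfig returns ReductionConfig{16}, and configure() tiles
  // every parallel loop by 8): for a rank-2 op with one parallel and one
  // reduction dimension, workgroupTileSizes == {8}, and the per-op tile sizes
  // built in the loop below are {0, 16} -- zeros for all but the most minor
  // dimension, which is tiled by the vector size.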
- for (auto [val, rank] : SmallVector>{ - {maybeLeadingHBlock, strategy.captures.maybeLeadingRank}, - {gridReductionH, strategy.captures.reductionRank}, - {maybeTiledTrailingHBlock, strategy.captures.maybeTrailingRank}}) { - if (rank == 0) - continue; - SmallVector tileSizes(rank - 1, 0); - tileSizes.push_back(strategy.getVectorSize()); - buildTileFuseToScfFor(b, variantH, val, {}, - getAsOpFoldResult(b.getI64ArrayAttr(tileSizes))); - } - - // Step 3-5. Common trailing steps. - vector::LowerVectorsOptions lowerVectorsOptions; - lowerVectorsOptions - .setVectorTransformsOptions(vector::VectorContractLowering::OuterProduct) - .setVectorMultiReductionLowering( - vector::VectorMultiReductionLowering::InnerParallel) - .setVectorTransferSplit(vector::VectorTransferSplit::LinalgCopy) - .setVectorTransposeLowering(vector::VectorTransposeLowering::EltWise) - .setTransposeAVX2Lowering(false) - .setUnrollVectorTransfers(true); - buildCommonTrailingStrategy(b, variantH, lowerVectorsOptions); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h deleted file mode 100644 index 282f3ced55b9..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_REDUCTION_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_REDUCTION_STRATEGY_H_ - -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" - -namespace mlir::iree_compiler::cpu { - -struct CPUModel; - -/// Structure to hold a summary of HW-derived properties to configure the -/// reduction strategy. -/// The objective of this struct is to act as a minimal summary of key -/// properties derived from the hardware (e.g. by an oracle) and that are -/// sufficient to steer the strategy to produce a good version. -/// These can be thought of as latent variables or embeddings that directly -/// control the strategy and can be derived from the hardware by some procedure. -struct ReductionConfig { - int64_t vectorSize; -}; - -/// A simple CPU ReductionStrategy. -class ReductionStrategy : public iree_compiler::AbstractReductionStrategy { -public: - ReductionStrategy(const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig); - - ReductionStrategy(const ReductionStrategy &) = default; - ReductionStrategy &operator=(const ReductionStrategy &) = default; - - int64_t getVectorSize() const { return vectorSize; } - -private: - /// Compute the small strategy based on the problem size. - void configure(const ReductionConfig &config); - - /// Vector size. - int64_t vectorSize; -}; - -/// Entry point to build the transform IR corresponding to a reduction strategy. -/// This is used to map an N-D parallel, 1-D reduction operation with optional -/// leading and optional trailing elementwise operations. -/// The 1-D reduction dimension must be in the most minor dimension. -/// The innermost dimensions of the leading and trailing operations must be most -/// minor along all accesses. 
-void buildReductionStrategy(ImplicitLocOpBuilder &b, Value variantH, - const ReductionStrategy &strategy); - -} // namespace mlir::iree_compiler::cpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_REDUCTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h deleted file mode 100644 index d89ffeed06ea..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_ABSTRACT_REDUCTION_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_ABSTRACT_REDUCTION_STRATEGY_H_ - -#include "iree-dialects/Transforms/TransformMatchers.h" - -namespace mlir::iree_compiler { - -/// Structure to hold the parameters that control the reduction strategy. -struct AbstractReductionStrategy { - AbstractReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - ArrayRef workgroupTileSizes) - : captures(captures), workgroupTileSizes(workgroupTileSizes) {} - - /// Constructor quantities. - transform_ext::MatchedReductionCaptures captures; - - /// Tile sizes for the workgroup / determines grid size for all known - /// reduction strategies. - SmallVector workgroupTileSizes; -}; - -} // namespace mlir::iree_compiler - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_ABSTRACT_REDUCTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/BUILD.bazel deleted file mode 100644 index 6771d9269026..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/BUILD.bazel +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2023 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:build_defs.oss.bzl", "iree_compiler_cc_library") - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_compiler_cc_library( - name = "TransformStrategies", - srcs = [ - "Common.cpp", - ], - hdrs = [ - "AbstractReductionStrategy.h", - "Common.h", - ], - deps = [ - # Dialects - "//compiler/src/iree/compiler/Dialect/Flow/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/TransformExtensions:LinalgExtExtensions", - "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineUtils", - "@llvm-project//mlir:AsyncDialect", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:ArithUtils", - "@llvm-project//mlir:BufferizationDialect", - "@llvm-project//mlir:BufferizationTransforms", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:FunctionInterfaces", - "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:LinalgDialect", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:PDLDialect", - "@llvm-project//mlir:PDLInterpDialect", - "@llvm-project//mlir:SCFDialect", - "@llvm-project//mlir:SCFUtils", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformDialect", - "@llvm-project//mlir:VectorDialect", - # IR - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Rewrite", - # Interfaces - # Transforms (needed mostly for the BufferizableOpInterfaceImpl) - "@llvm-project//mlir:ArithTransforms", - "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:MemRefTransformOps", - "@llvm-project//mlir:SCFTransforms", - "@llvm-project//mlir:SCFTransformOps", - "@llvm-project//mlir:TensorTransforms", - "@llvm-project//mlir:TensorTransformOps", - "@llvm-project//mlir:TransformLoopExtension", - "@llvm-project//mlir:VectorTransforms", - "@llvm-project//mlir:VectorTransformOps", - # Other Stuff - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:DialectUtils", - # TransformExtensions - "//compiler/src/iree/compiler/Codegen/Common/TransformExtensions:CommonExtensions", - # TransformMatchers and other stuff - "//llvm-external-projects/iree-dialects:IREEDialectsTransforms", - ], -) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/CMakeLists.txt deleted file mode 100644 index 0198dab69c14..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# compiler/src/iree/compiler/Codegen/TransformStrategies/Common/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
# -################################################################################ - -iree_add_all_subdirs() - -iree_cc_library( - NAME - TransformStrategies - HDRS - "AbstractReductionStrategy.h" - "Common.h" - SRCS - "Common.cpp" - DEPS - IREEDialectsTransforms - IREELinalgTransformDialect - LLVMSupport - MLIRAffineDialect - MLIRAffineUtils - MLIRAnalysis - MLIRArithDialect - MLIRArithTransforms - MLIRArithUtils - MLIRAsyncDialect - MLIRBufferizationDialect - MLIRBufferizationTransforms - MLIRFuncDialect - MLIRFunctionInterfaces - MLIRGPUDialect - MLIRIR - MLIRLLVMDialect - MLIRLinalgDialect - MLIRLinalgTransforms - MLIRMemRefTransformOps - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRParser - MLIRPass - MLIRRewrite - MLIRSCFDialect - MLIRSCFTransformOps - MLIRSCFTransforms - MLIRSCFUtils - MLIRSupport - MLIRTensorDialect - MLIRTensorTransformOps - MLIRTensorTransforms - MLIRTransformDialect - MLIRTransformLoopExtension - MLIRVectorDialect - MLIRVectorTransformOps - MLIRVectorTransforms - iree::compiler::Codegen::Common::TransformExtensions::CommonExtensions - iree::compiler::Dialect::Flow::IR - iree::compiler::Dialect::LinalgExt::IR - iree::compiler::Dialect::LinalgExt::TransformExtensions::LinalgExtExtensions - PUBLIC -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp deleted file mode 100644 index 75d492f859de..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp +++ /dev/null @@ -1,485 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h" -#include "mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") - -// TODO: significantly better namespacing. 
-using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using iree_compiler::IREE::transform_dialect::IREEBufferizeOp; -using iree_compiler::IREE::transform_dialect::IREEEliminateEmptyTensorsOp; -using iree_compiler::IREE::transform_dialect:: - PopulateWorkgroupCountRegionUsingNumThreadsSliceOp; -using transform::FuseIntoContainingOp; -using transform::HoistLoopInvariantSubsetsOp; -using transform::MatchOp; -using transform::MemRefEraseDeadAllocAndStoresOp; -using transform::MergeHandlesOp; -using transform::NamedSequenceOp; -using transform::PrintOp; -using transform::SplitHandleOp; -using transform::SplitReductionOp; -using transform::TileUsingForallOp; -using transform::VectorizeChildrenAndApplyPatternsOp; -using transform_ext::RegisterMatchCallbacksOp; -using transform_ext::TakeFirstOp; - -/// Matches `args` within `targetH` and unpacks a number of handles `N`. -/// Assumes there are exactly `N` matched ops (but could be relaxed). -/// Returns the tuple of handles. -template -auto matchAndUnpack(ImplicitLocOpBuilder &b, Value targetH, - MatchingArgs... args) { - Value matchedH = b.create(targetH, args...); - auto matchOp = b.create(matchedH, - /*numHandles=*/N); - assert(matchOp->getNumResults() == N && "Unexpected number of results"); - std::array a; - for (int64_t i = 0; i < N; ++i) - a[i] = matchOp->getResult(i); - return std::tuple_cat(a); -} - -int64_t mlir::iree_compiler::previousMultipleOf(int64_t val, int64_t multiple) { - assert(val > 0 && "expected nonnegative val"); - assert(multiple > 0 && "expected nonnegative multiple"); - return (val / multiple) * multiple; -} - -int64_t mlir::iree_compiler::nextMultipleOf(int64_t val, int64_t multiple) { - assert(val > 0 && "expected nonnegative val"); - assert(multiple > 0 && "expected nonnegative multiple"); - return ((val + multiple - 1) / multiple) * multiple; -} - -FailureOr -mlir::iree_compiler::maxDivisorOfValueBelowLimit(int64_t value, int64_t limit) { - // Conservatively return failure when `limit` is greater than 1024 to avoid - // prohibitively long compile time overheads. - // TODO: approximate with a faster implementation based on a few desirable - // primes. - if (limit > 1024) - return failure(); - // If either value or limit is <= 0, the loop is skipped and we fail. 
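  // Worked examples for the rounding and divisor helpers here (illustrative
  // only):
  //   previousMultipleOf(50, 16) == 48   (rounds down to a multiple)
  //   nextMultipleOf(50, 16)     == 64   (rounds up to a multiple)
  //   maxDivisorOfValueBelowLimit(128, 100) returns 64, the largest divisor
  //   of 128 that does not exceed 100; limits above 1024 conservatively fail
  //   to keep compile time bounded, as noted above.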
- for (int64_t i = std::min(value, limit); i > 1; --i) - if (value % i == 0) - return i; - return failure(); -} - -void mlir::iree_compiler::createTransformRegion( - mlir::FunctionOpInterface entryPoint, StrategyBuilderFn buildStrategy) { - MLIRContext *ctx = entryPoint.getContext(); - Location loc = entryPoint.getLoc(); - OpBuilder b(ctx); - b.setInsertionPointAfter(entryPoint); - auto topLevelTransformModule = b.create(loc); - topLevelTransformModule->setAttr( - transform::TransformDialect::kWithNamedSequenceAttrName, b.getUnitAttr()); - Region &topLevelTransformRegion = topLevelTransformModule.getBodyRegion(); - b.setInsertionPointToStart(&topLevelTransformRegion.front()); - auto anyOpType = transform::AnyOpType::get(b.getContext()); - auto sequence = b.create( - loc, - /*symName=*/ - std::string( - transform::TransformDialect::kTransformEntryPointSymbolName.str()), - /*rootType*/ anyOpType, - /*resultTypes=*/TypeRange{}, - /*bodyBuilder=*/[&](OpBuilder &b, Location loc, Value variantH) { - ImplicitLocOpBuilder ib(loc, b); - buildStrategy(ib, variantH); - b.create(loc); - }); - (void)sequence; - - LDBG("transformation script:\n"); - LDBG("verification: " << sequence.verify().succeeded() << "\n"); -} - -//===----------------------------------------------------------------------===// -// Low-level reusable builder APIs, these should follow MLIR-style builders. -//===----------------------------------------------------------------------===// - -/// Prints `handles` in order. Prints the whole IR if `handles` is empty. -void mlir::iree_compiler::buildPrint(ImplicitLocOpBuilder &b, - ValueRange handles) { - if (handles.empty()) - b.create(); - for (auto h : handles) - b.create(h); -} - -/// Create an ApplyPatternsOp that performs a set of key canonicalizations and -/// so-called enabling transformations to normalize the IR. -/// In addition to the specified transform, perform the following ones: -/// tiling-related canonicalization patterns, canonicalization, licm and cse -/// (in this order). -void mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - ImplicitLocOpBuilder &b, Value funcH, - ApplyPatternsOpBodyBuilderFn populatePatternsFn) { - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - b.create(loc); - b.create(loc); - if (populatePatternsFn) - populatePatternsFn(b, loc); - }); - b.create( - funcH); - b.create(funcH); -} - -/// Dynamically selects the first non-empty handle; i.e. if (h1, h2) is: -/// - (non-empty, non-empty), returns (h1, h2) -/// - (empty, non-empty), returns (h2, empty) -/// - (non-empty, empty), returns (h1, empty) -/// - (empty, empty), returns (empty, empty) -/// This is used as a normalization operation that replaces conditionals, either -/// in C++ or in transform IR. -/// This can be thought of as a control-flow -> data-dependent conversion. 
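// A plain-C++ analog of the selection semantics documented above -- a sketch
// for illustration only, using std::optional in place of transform-dialect
// handles; it is not the implementation that follows.
#include <optional>
#include <string>
#include <utility>

using Handle = std::optional<std::string>;

// (h1, h2) -> (first non-empty handle, the remaining handle or empty).
static std::pair<Handle, Handle> selectFirstNonEmpty(Handle h1, Handle h2) {
  if (h1)
    return {h1, h2};                     // (non-empty, *): keep both as-is
  if (h2)
    return {h2, std::nullopt};           // (empty, non-empty): promote h2
  return {std::nullopt, std::nullopt};   // (empty, empty)
}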
-std::pair -mlir::iree_compiler::buildSelectFirstNonEmpty(ImplicitLocOpBuilder &b, - Value handle1, Value handle2) { - auto anyOpType = transform::AnyOpType::get(b.getContext()); - auto selector = b.create(anyOpType, anyOpType, - ArrayRef{handle1, handle2}); - return std::make_pair(selector.getFirst(), selector.getRest()); -} - -mlir::iree_compiler::TileToScfForAndFuseResult -mlir::iree_compiler::buildTileFuseToScfFor(ImplicitLocOpBuilder &b, - Value variantH, Value rootH, - ValueRange opsHToFuse, - ArrayRef tileSizes, - bool canonicalize) { - assert(opsHToFuse.empty() && "No fusion supported yet"); - iree_compiler::TileToScfForAndFuseResult result; - auto tiletoScfForOp = b.create(rootH, tileSizes); - result.forLoops = tiletoScfForOp.getLoops(); - result.tiledOpH = tiletoScfForOp.getTiledLinalgOp(); - - // Perform a pass of canonicalization + enabling after tiling. Currently this - // folds away the extract slice on the iterator, breaking padding on aligned - // matmuls. - // TODO: Make padding less brittle so that this toggle is unnecessary. - if (canonicalize) { - Value funcH = b.create( - variantH, func::FuncOp::getOperationName()); - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - } - return result; -} - -/// Performs the following transformations: -/// 1. Tiles `rootH` to scf.forall to with `tileSizesOrNumThreads` -/// according to whether spec is a TileSizesSpec or a NumThreadsSpec. -/// 2. Maps the resulting scf.forall to threads according to -/// `threadDimMapping`. -/// 3. Iterates over `opsHToFuse` in order and fuses into the containing op. -/// Returns a handle to the resulting scf.forall. -/// -/// Fusion operates in batch mode: a single fusion command is issued and a -/// topological sort is automatically computed by the fusion. -/// Since this applies a single fusion, no interleaved canonicalization / cse / -/// enabling transformation occurs and the resulting fusion may not be as good. -/// -/// In the future, an iterative mode in which the user is responsible for -/// providing the fusion order and has interleaved canonicalization / cse / -/// enabling transform will be introduced and may result in better fusions. -/// -/// If `resultingFusedOpsHandles` is a non-null pointer, the fused operation are -/// appended in order. -// TODO: apply forwarding pattern. -template -static iree_compiler::TileToForallAndFuseAndDistributeResult -buildTileAndFuseAndDistributeImpl(ImplicitLocOpBuilder &b, Value variantH, - Value rootH, ValueRange opsHToFuse, - ArrayRef tileSizesOrNumThreads, - ArrayAttr threadDimMapping) { - iree_compiler::TileToForallAndFuseAndDistributeResult result; - auto tileToForeachOp = b.create( - rootH, tileSizesOrNumThreads, TileOrNumThreadSpec(), threadDimMapping); - - result.forallH = tileToForeachOp.getForallOp(); - result.tiledOpH = tileToForeachOp.getTiledOp(); - - // Perform a pass of canonicalization + enabling after tiling. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Batch fusion if requested. - if (opsHToFuse.size() > 1) { - Value mergedOpsH = - b.create(opsHToFuse, /*deduplicate=*/true); - b.create(mergedOpsH, result.forallH).getFusedOp(); - } else if (opsHToFuse.size() == 1) { - Value fusedH = - b.create(opsHToFuse.front(), result.forallH) - .getFusedOp(); - result.resultingFusedOpsHandles.push_back(fusedH); - } - return result; -} - -// TODO: if someone knows how to properly export templates go for it .. -// sigh. 
-iree_compiler::TileToForallAndFuseAndDistributeResult -mlir::iree_compiler::buildTileFuseDistToForallWithTileSizes( - ImplicitLocOpBuilder &b, Value variantH, Value rootH, ValueRange opsHToFuse, - ArrayRef tileSizes, ArrayAttr threadDimMapping) { - return buildTileAndFuseAndDistributeImpl( - b, variantH, rootH, opsHToFuse, tileSizes, threadDimMapping); -} - -/// Call buildTileAndFuseAndDistributeImpl with ArrayRef numThreads. -// TODO: if someone knows how to properly export templates go for it .. -// sigh. -iree_compiler::TileToForallAndFuseAndDistributeResult -mlir::iree_compiler::buildTileFuseDistToForallWithNumThreads( - ImplicitLocOpBuilder &b, Value variantH, Value rootH, ValueRange opsHToFuse, - ArrayRef numThreads, ArrayAttr threadDimMapping) { - return buildTileAndFuseAndDistributeImpl( - b, variantH, rootH, opsHToFuse, numThreads, threadDimMapping); -} - -/// Build the transform IR to pad an op `opH`. -// TODO: Better upstream builder. -Value mlir::iree_compiler::buildPad( - ImplicitLocOpBuilder &b, Value opH, ArrayRef paddingValues, - ArrayRef paddingDimensions, ArrayRef packingDimensions, - ArrayRef> transposePaddings) { - SmallVector staticPadToMultipleOf(paddingDimensions.size(), 1); - SmallVector transposeAttrs; - for (auto &transp : transposePaddings) - transposeAttrs.push_back(b.getI64ArrayAttr(transp)); - - Type resultTypes[] = {opH.getType(), - transform::AnyOpType::get(b.getContext()), - transform::AnyOpType::get(b.getContext())}; - return b - .create( - resultTypes, opH, b.getArrayAttr(paddingValues), - b.getI64ArrayAttr(paddingDimensions), - /*padToMultipleOf=*/ValueRange{}, staticPadToMultipleOf, - b.getI64ArrayAttr(packingDimensions), b.getArrayAttr(transposeAttrs), - /*copyBack=*/b.getStringAttr("none")) - ->getResult(0); -} - -/// Apply patterns and vectorize. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -// TODO: configure patterns. -Value mlir::iree_compiler::buildVectorize(ImplicitLocOpBuilder &b, Value funcH, - bool applyCleanups, - bool vectorizePadding, - bool vectorizeNdExtract) { - funcH = b.create(funcH, vectorizePadding, - vectorizeNdExtract); - if (applyCleanups) { - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - } - return funcH; -} - -void mlir::iree_compiler::buildLowerMaskedTransfersAndCleanup( - ImplicitLocOpBuilder &b, Value funcH, bool cleanup) { - // TODO: avoid functional style transform so we can apply to the variant. - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - if (cleanup) { - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - b.create(loc); - }); - } -} - -Value mlir::iree_compiler::buildLowerVectorMasksAndCleanup( - ImplicitLocOpBuilder &b, Value funcH, bool cleanup) { - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - if (cleanup) { - iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - } - return funcH; -} - -/// Hoist redundant subet ops. -void mlir::iree_compiler::buildHoisting(ImplicitLocOpBuilder &b, Value funcH) { - Value loops = - b.create(funcH, scf::ForOp::getOperationName()); - b.create(loops); -} - -/// Bufferize and drop HAL descriptor from memref ops. 
-Value mlir::iree_compiler::buildBufferize(ImplicitLocOpBuilder &b, - Value variantH, bool targetGpu) { - // Perform a pass of canonicalization + enabling before bufferization to avoid - // spurious allocations. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - }); - b.create(funcH); - variantH = b.create(funcH, targetGpu); - return variantH; -} - -namespace { -/// Various handles produced by reduction splitting. -struct ReductionSplitResult { - /// Handle to the leading elementwise operation, may be null if no such - /// operation is present. - Value leadingEltwiseH; - /// Handle to the fill operation feeding the init of a higher-rank - /// more-parallel reduction. - Value splitFillH; - /// Handle to the higher-rank more-parallel reduction. - Value splitLinalgH; - /// Handle to the final reduction. - Value combinerH; - /// Handle to the original fill operation, may be null if the operation - /// was not re-matched. - Value originalFillH; - /// Handle to the trailing fill operation, may be null if the operation - /// was not re-matched. - Value trailingEltwiseH; -}; -} // namespace - -/// Build transform IR to split the reduction into a parallel and combiner part. -/// Then tile the parallel part and map it to `tileSize` threads, each reducing -/// on `vectorSize` elements. -/// Lastly, fuse the newly created fill and elementwise operations into the -/// resulting containing forall op. -/// Return a triple of handles to (forall, fill, combiner) -std::tuple -mlir::iree_compiler::buildTileReductionUsingScfForeach( - ImplicitLocOpBuilder &b, Value isolatedParentOpH, Value reductionH, - int64_t reductionRank, int64_t tileSize, int64_t reductionVectorSize, - Attribute mappingAttr) { - SmallVector leadingParallelDims(reductionRank - 1, 0); - SmallVector numThreads = leadingParallelDims; - numThreads.push_back(tileSize); - SmallVector tileSizes = leadingParallelDims; - tileSizes.push_back(reductionVectorSize); - auto tileReduction = b.create( - /*target=*/reductionH, - /*numThreads=*/numThreads, - /*tileSizes=*/tileSizes, - /*threadDimMapping=*/b.getArrayAttr(mappingAttr)); - Value blockParallelForallOp = tileReduction.getForallOp(); - Value blockParallelFillH = tileReduction.getFillOp().front(); - Value blockCombinerOpH = tileReduction.getCombiningLinalgOp(); - // Fuse the fill and elementwise to privatize them. - blockParallelFillH = - b.create(blockParallelFillH, blockParallelForallOp) - .getFusedOp(); - return std::make_tuple(blockParallelForallOp, blockParallelFillH, - blockCombinerOpH); -} - -std::tuple -mlir::iree_compiler::buildReductionStrategyBlockDistribution( - ImplicitLocOpBuilder &b, Value variantH, - ArrayRef workgroupTileSizes) { - // Step 1. Call the matcher. Note that this is the same matcher as used to - // trigger this compilation path, so it must always apply. - b.create(); - auto [maybeLeadingH, fillH, reductionH, maybeTrailingH] = - unpackRegisteredMatchCallback<4>( - b, "reduction", transform::FailurePropagationMode::Propagate, - variantH); - // Step 2. Create the block/mapping tiling level and fusee. 
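In buildTileReductionUsingScfForeach above, only the most-minor (reduction) dimension receives a non-zero thread count and tile size; the leading parallel dimensions are left untiled (size 0). A standalone illustration of how the two size vectors are formed (the helper name is hypothetical):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Mirrors the vector construction above: leading parallel dims get 0,
// the trailing reduction dim gets `tileSize` threads and
// `reductionVectorSize` elements reduced per thread.
static std::pair<std::vector<int64_t>, std::vector<int64_t>>
makeReductionTileVectors(int64_t reductionRank, int64_t tileSize,
                         int64_t reductionVectorSize) {
  std::vector<int64_t> numThreads(reductionRank - 1, 0);
  std::vector<int64_t> tileSizes(reductionRank - 1, 0);
  numThreads.push_back(tileSize);
  tileSizes.push_back(reductionVectorSize);
  return {numThreads, tileSizes};
}

int main() {
  auto [numThreads, tileSizes] = makeReductionTileVectors(2, 64, 4);
  assert((numThreads == std::vector<int64_t>{0, 64}));
  assert((tileSizes == std::vector<int64_t>{0, 4}));
}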
- auto [fusionTargetH, fusionGroupH] = - buildSelectFirstNonEmpty(b, maybeTrailingH, reductionH); - MLIRContext *ctx = b.getContext(); - SmallVector blockDimMapping{blockX(ctx), blockY(ctx), blockZ(ctx)}; - blockDimMapping.resize(workgroupTileSizes.size()); - TileToForallAndFuseAndDistributeResult tileResult = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/fusionTargetH, - /*opsToFuseH=*/fusionGroupH, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(workgroupTileSizes)), - /*threadDimMapping=*/b.getArrayAttr(blockDimMapping)); - - // Handle the workgroup count region. - b.create( - tileResult.forallH); - - fillH = - b.create(fillH, tileResult.forallH).getFusedOp(); - maybeLeadingH = - b.create(maybeLeadingH, tileResult.forallH) - .getFusedOp(); - - // Perform a pass of canonicalization + enabling after fusion. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Step 3. Normalize to reorder results irrespective of emptiness. - auto [blockReductionH, maybeBlockTrailingH] = buildSelectFirstNonEmpty( - b, tileResult.resultingFusedOpsHandles.front(), tileResult.tiledOpH); - return std::make_tuple(maybeLeadingH, fillH, blockReductionH, - maybeBlockTrailingH, tileResult.forallH); -} - -Value mlir::iree_compiler::buildMemoryOptimizations(ImplicitLocOpBuilder &b, - Value funcH) { - // Apply canonicalizations and enablings twice as they enable each other. - for (int i = 0; i < 2; ++i) { - buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - }); - } - b.create(funcH); - return funcH; -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h deleted file mode 100644 index 482ee2257680..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_COMMON_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_COMMON_H_ - -#include "mlir/Interfaces/FunctionInterfaces.h" -// Needed until IREE builds its own gpu::GPUBlockMappingAttr / gpu::Blocks -// attributes that are reusable across all targets. -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/IR/BuiltinOps.h" - -namespace mlir::iree_compiler { - -//===----------------------------------------------------------------------===// -// Base quantities generally useful for all CPU and GPU strategies. -//===----------------------------------------------------------------------===// -inline Attribute blockX(MLIRContext *ctx) { - return mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimX); -} -inline Attribute blockY(MLIRContext *ctx) { - return mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimY); -} -inline Attribute blockZ(MLIRContext *ctx) { - return mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimZ); -} - -struct AbstractReductionStrategy; - -//===----------------------------------------------------------------------===// -// General helpers. 
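The block-distribution step above truncates the {x, y, z} block mapping to the rank of the workgroup tile sizes. A minimal sketch of that construction, assuming rank <= 3 and an MLIRContext with the GPU dialect loaded (the function name is hypothetical):

#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"

// Build the thread-dim mapping attribute for an N-D workgroup tiling,
// keeping only as many mapping dimensions as there are tile sizes.
static mlir::ArrayAttr
makeBlockMappingSketch(mlir::OpBuilder &b,
                       llvm::ArrayRef<int64_t> workgroupTileSizes) {
  mlir::MLIRContext *ctx = b.getContext();
  llvm::SmallVector<mlir::Attribute> mapping{
      mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimX),
      mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimY),
      mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimZ)};
  mapping.resize(workgroupTileSizes.size()); // e.g. rank 2 -> {DimX, DimY}
  return b.getArrayAttr(mapping);
}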
-//===----------------------------------------------------------------------===// - -/// Return the greatest value smaller or equal to `val` that is a multiple -/// of `multiple`. Asserts that all quantities are nonnegative. I.e. returns -/// `(val / multiple) * multiple` a.k.a `floordiv(val, multiple) * multiple`. -int64_t previousMultipleOf(int64_t val, int64_t multiple); - -/// Return the smallest value greater or equal to `val` that is a multiple of -/// `multiple`. Asserts that all quantities are nonnegative. -/// I.e. returns `((val + multiple - 1) / multiple) * multiple` a.k.a -/// a.k.a `ceildiv(val, multiple) * multiple`. -int64_t nextMultipleOf(int64_t val, int64_t multiple); - -/// Find the highest divisor of `value` that is smaller than `limit`. This is -/// useful to capture any tiling that is guaranteed to keep the IR static. -/// Conservatively return failure when `limit` is greater than 1024 to avoid -/// prohibitively long compile time overheads. -// TODO: approximate with a faster implementation based on a few desirable -// primes. -FailureOr maxDivisorOfValueBelowLimit(int64_t value, int64_t limit); - -using StrategyBuilderFn = std::function; - -/// Use `buildStrategy` to build a ModuleOp containing transform dialect IR, -/// right after function `entryPoint`. -/// This embed the transform into the IR and allows applying it either in debug -/// mode or within the IREE pipeline. -void createTransformRegion(mlir::FunctionOpInterface entryPoint, - StrategyBuilderFn buildStrategy); - -//===----------------------------------------------------------------------===// -// Low-level reusable builder APIs, these should follow MLIR-style builders. -//===----------------------------------------------------------------------===// - -/// Build transform IR that prints `handles` in order, or print the whole IR if -/// `handles` is empty. -void buildPrint(ImplicitLocOpBuilder &b, ValueRange handles = {}); - -using ApplyPatternsOpBodyBuilderFn = std::function; - -/// Create an ApplyPatternsOp that performs a set of key canonicalizations and -/// so-called enabling transformations to normalize the IR. -/// In addition to the specified transform, perform the following ones: -/// canonicalization, tiling_canonicalization, licm and cse (in this order). -void buildCanonicalizationAndEnablingTransforms( - ImplicitLocOpBuilder &b, Value funcH, - ApplyPatternsOpBodyBuilderFn populatePatternsFn = nullptr); - -/// Build transform IR to dynamically selects the first non-empty handle; i.e. -/// if (h1, h2) is: -/// - (non-empty, non-empty), returns (h1, h2) -/// - (empty, non-empty), returns (h2, empty) -/// - (non-empty, empty), returns (h1, empty) -/// - (empty, empty), returns (empty, empty) -/// This is used as a normalization operation that replaces conditionals, either -/// in C++ or in transform IR. -/// This can be thought of as a control-flow -> data-dependent conversion. -std::pair buildSelectFirstNonEmpty(ImplicitLocOpBuilder &b, - Value handle1, Value handle2); - -/// Result of the combined transform performing tiling, fusion and -/// distribution to parallel constructs. -struct TileToScfForAndFuseResult { - /// Vector of `scf.for` loops containing the tiled and fused operations. - SmallVector forLoops; - /// Handles to fused operations other than the final consumer operation. May - /// be empty if fusion was not performed iteratively. - /// This is currently empty - // TODO: support returning handles from `fuse_into_containing_op` and remove - // the restriction above. 
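The doc comments above fully specify previousMultipleOf and nextMultipleOf. A minimal reference implementation with worked values (the `*Ref` names are hypothetical and assume `multiple > 0`):

#include <cassert>
#include <cstdint>

// previousMultipleOf == floordiv(val, multiple) * multiple
// nextMultipleOf     == ceildiv(val, multiple) * multiple
static int64_t previousMultipleOfRef(int64_t val, int64_t multiple) {
  assert(val >= 0 && multiple > 0 && "expected nonnegative quantities");
  return (val / multiple) * multiple;
}
static int64_t nextMultipleOfRef(int64_t val, int64_t multiple) {
  assert(val >= 0 && multiple > 0 && "expected nonnegative quantities");
  return ((val + multiple - 1) / multiple) * multiple;
}

int main() {
  assert(previousMultipleOfRef(100, 32) == 96);
  assert(nextMultipleOfRef(100, 32) == 128);
  assert(previousMultipleOfRef(100, 128) == 0); // no positive multiple fits
  assert(nextMultipleOfRef(128, 32) == 128);    // already a multiple
}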
- SmallVector resultingFusedOpsHandles; - /// Handle to the tiled final consumer operation. - Value tiledOpH; -}; - -/// Build transform IR to perform multi-level tile and fuse into an scf.for op. -/// Note: fusion is currently unsupported. -TileToScfForAndFuseResult -buildTileFuseToScfFor(ImplicitLocOpBuilder &b, Value variantH, Value rootH, - ValueRange opsHToFuse, ArrayRef tileSizes, - bool canonicalize = true); - -/// Result of the combined transform performing tiling, fusion and -/// distribution to parallel constructs. -struct TileToForallAndFuseAndDistributeResult { - /// Outer `scf.forall` loop containing the tiled and fused - /// operations. - Value forallH; - /// Handles to fused operations other than the final consumer operation. May - /// be empty if fusion was not performed iteratively. - // TODO: support returning handles from `fuse_into_containing_op` and remove - // the restriction above. - SmallVector resultingFusedOpsHandles; - /// Handle to the tiled final consumer operation. - Value tiledOpH; -}; - -/// Build transform IR to perform the following transformations: -/// 1. Tiles `rootH` to scf.forall to with `tileSizesOrNumThreads` -/// according to whether spec is a TileSizesSpec or a NumThreadsSpec. -/// 2. Maps the resulting scf.forall to threads according to -/// `threadDimMapping`. -/// 3. Iterates over `opsHToFuse` in order and fuses into the containing op. -/// -/// Fusion operates in batch mode: a single fusion command is issued and a -/// topological sort is automatically computed by the fusion. -/// Since this applies a single fusion, no interleaved canonicalization / cse -/// / enabling transformation occurs and the resulting fusion may not be as -/// good. -/// -/// In the future, an iterative mode in which the user is responsible for -/// providing the fusion order and has interleaved canonicalization / cse / -/// enabling transform will be introduced and may result in better fusions. -/// -/// Note: this version cannot be used for the block-level tiling in a dispatch -/// region. `buildTileFuseDistToForallAndWorkgroupCountWithTileSizes` is -/// the modified version that is aware of the `workgroup_count` region. -/// -// TODO: if someone knows how to properly export templates go for it .. sigh. -TileToForallAndFuseAndDistributeResult buildTileFuseDistToForallWithTileSizes( - ImplicitLocOpBuilder &b, Value variantH, Value rootH, ValueRange opsHToFuse, - ArrayRef tileSizes, ArrayAttr threadDimMapping); - -/// Similar to `buildTileFuseDistWithTileSizes` but using `numThreads` instead -/// of `tileSizes`. -TileToForallAndFuseAndDistributeResult buildTileFuseDistToForallWithNumThreads( - ImplicitLocOpBuilder &b, Value variantH, Value rootH, ValueRange opsHToFuse, - ArrayRef numThreads, ArrayAttr threadDimMapping); - -/// Build transform IR to split the reduction into a parallel and combiner part. -/// Then tile the parallel part and map it to `tileSize` threads, each reducing -/// on `vectorSize` elements. -/// Lastly, fuse the newly created fill and elementwise operations into the -/// resulting containing forall op. -/// Return a triple of handles to (forall, fill, combiner) -std::tuple buildTileReductionUsingScfForeach( - ImplicitLocOpBuilder &b, Value isolatedParentOpH, Value reductionH, - int64_t reductionRank, int64_t tileSize, int64_t reductionVectorSize, - Attribute mappingAttr); - -/// Build the transform IR to pad an op `opH`. -// TODO: Better upstream builder. 
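A minimal usage sketch of buildTileFuseDistToForallWithTileSizes, mirroring the call sites elsewhere in this diff (illustration only; assumes the declarations above, `using namespace mlir; using namespace mlir::iree_compiler;`, and arbitrary 64x64 tile sizes):

// Tile a root op to an scf.forall over workgroups and map it to GPU blocks.
static Value tileRootToWorkgroupsSketch(ImplicitLocOpBuilder &b,
                                        Value variantH, Value rootH) {
  MLIRContext *ctx = b.getContext();
  SmallVector<Attribute> mapping{blockX(ctx), blockY(ctx)};
  TileToForallAndFuseAndDistributeResult res =
      buildTileFuseDistToForallWithTileSizes(
          /*b=*/b, /*variantH=*/variantH, /*rootH=*/rootH,
          /*opsHToFuse=*/{},
          /*tileSizes=*/getAsOpFoldResult(b.getI64ArrayAttr({64, 64})),
          /*threadDimMapping=*/b.getArrayAttr(mapping));
  return res.tiledOpH; // the distributed loop itself is in res.forallH
}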
-Value buildPad(ImplicitLocOpBuilder &b, Value opH, - ArrayRef paddingValues, - ArrayRef paddingDimensions, - ArrayRef packingDimensions, - ArrayRef> transposePaddings = {}); - -/// Build transform IR that applies rank-reduction patterns and vectorizes. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -/// If `applyCleanups` is true, also apply cleanup patterns. -Value buildVectorize(ImplicitLocOpBuilder &b, Value funcH, - bool applyCleanups = false, bool vectorizePadding = false, - bool vectorizeNdExtract = false); - -/// Build transform IR that applies lowering of masked vector transfer -/// operations and subsequent cleanup patterns (fold-memref-aliases). -/// Takes a handle to a containing op and returns an updated handle to the -/// containing op. -void buildLowerMaskedTransfersAndCleanup(ImplicitLocOpBuilder &b, Value funcH, - bool cleanup = true); - -/// Build transform IR that applies vector mask lowering and subsequent cleanup -/// patterns (fold-memref-aliases). -/// Takes a handle to a containing op and returns an updated handle to the -/// containing op. -Value buildLowerVectorMasksAndCleanup(ImplicitLocOpBuilder &b, Value funcH, - bool cleanup = true); - -/// Build transform IR to hoist redundant subset operations. -void buildHoisting(ImplicitLocOpBuilder &b, Value funcH); - -/// Build transform IR to bufferize and drop HAL descriptor from memref ops. -/// Takes a handle variantOp and returns a handle to the same variant op. -Value buildBufferize(ImplicitLocOpBuilder &b, Value variantH, - bool targetGpu = false); - -//===----------------------------------------------------------------------===// -// Higher-level problem-specific strategy creation APIs, these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// - -/// Build transform IR to match exactly an N-D reduction operation (with -/// optional leading and trailing elementwise) and create a top-level -/// `scf.forall` tiled by `strategy.workgroupTileSizes`. -/// The matched `maybeLeadingH`, `fillH`, `reductionH` and `maybeTrailingH` are -/// fused into the top-level `scf.forall` and handles are returned to -/// the fused versions of these ops, in order, that are all tiled and -/// distributed accordingly. The scf.forall is returned as the last -/// value. -/// The mapping of the `scf.forall` dimensions is tied the first -/// dimensions of `strategy.allBlockAttrs`. -/// -/// Note: `buildTileFuseDistToForallAndWorkgroupCountWithTileSizes` is -/// called internally, this version is only for the block-level tiling inside a -/// dispatch region with an attached workgroup_count region. -/// -/// Note: the matching is enforced to be exact (i.e. no other compute ops may -/// exist under variantH). This is consistent with application confined within -/// the dispatch region, where we must not miss any op. -/// -/// Note: A future version of this op will be able to directly apply on the DAG -/// and form the dispatch region. -std::tuple -buildReductionStrategyBlockDistribution(ImplicitLocOpBuilder &b, Value variantH, - ArrayRef workgroupTileSizes); - -/// Build transform IR that applies memory optimizations. 
-Value buildMemoryOptimizations(ImplicitLocOpBuilder &b, Value funcH); - -} // namespace mlir::iree_compiler - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_COMMON_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp deleted file mode 100644 index fcba3716fd15..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include - -#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" - -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -/// Options to set the default values of the matmul strategy. - -static llvm::cl::list clBlockTileSizes( - "td-matmul-strategy-blk-sizes", - llvm::cl::desc("block tile size for dims (x,y,z) for the transform " - "dialect matmul strategy"), - llvm::cl::CommaSeparated); -static llvm::cl::opt clReductionTileSize( - "td-matmul-strategy-reduc-size", - llvm::cl::desc( - "reduction tile sized for the transform dialect matmul strategy")); -static llvm::cl::list clNumThreads( - "td-matmul-strategy-num-threads", - llvm::cl::desc("number of threads for dims (x,y,z) for the transform " - "dialect matmul strategy"), - llvm::cl::CommaSeparated); -static llvm::cl::list clNumWarps( - "td-matmul-strategy-num-warps", - llvm::cl::desc("number of warps for dims (x,y,z) for the transform " - "dialect matmul strategy"), - llvm::cl::CommaSeparated); -static llvm::cl::opt clUseAsyncCopies( - "td-matmul-strategy-use-async-copies", - llvm::cl::desc( - "use asynchronous copies for the transform dialect matmul strategy")); -static llvm::cl::opt clUseMmaSync( - "td-matmul-strategy-use-mma-sync", - llvm::cl::desc("use mma sync for the transform dialect matmul strategy")); -static llvm::cl::opt clUseWmma( - "td-matmul-strategy-use-wmma", - llvm::cl::desc("use wmma for the transform dialect matmul strategy")); -static llvm::cl::opt clUseFma( - "td-matmul-strategy-use-fma", - llvm::cl::desc("use fma for the transform dialect matmul strategy")); -static llvm::cl::opt clPipelineDepth( - "td-matmul-strategy-pipeline-depth", - llvm::cl::desc("pipeline depth for the transform dialect matmul strategy")); -static llvm::cl::opt clPeelPipelineEpilogue( - "td-matmul-strategy-peel-pipeline-epilogue", - llvm::cl::desc("whether to peel the pipeline epilogue for the transform " - "dialect matmul strategy")); - -using iree_compiler::gpu::AbstractGemmLikeStrategy; - -/// Key function for vtable. 
-AbstractGemmLikeStrategy::~AbstractGemmLikeStrategy() {} - -void AbstractGemmLikeStrategy::initDefaultValues(const GPUModel &gpuModel) { - blockTileSizes = - SmallVector{clBlockTileSizes.begin(), clBlockTileSizes.end()}; - numThreads = SmallVector{clNumThreads.begin(), clNumThreads.end()}; - numWarps = SmallVector{clNumWarps.begin(), clNumWarps.end()}; - reductionTileSize = clReductionTileSize; - useAsyncCopies = clUseAsyncCopies; - useMmaSync = clUseMmaSync; - useWmma = clUseWmma; - useFma = clUseFma; - pipelineDepth = clPipelineDepth; - peelPipelineEpilogue = clPeelPipelineEpilogue; - - /// cliOptionsSpecified is used to override hard-coded well known good - /// defaults when set. - if (clBlockTileSizes.getNumOccurrences() || - clNumThreads.getNumOccurrences() || clNumWarps.getNumOccurrences() || - clReductionTileSize.getNumOccurrences() || - clUseAsyncCopies.getNumOccurrences() || - clUseMmaSync.getNumOccurrences() || clUseWmma.getNumOccurrences() || - clUseFma.getNumOccurrences() || clPipelineDepth.getNumOccurrences() || - clPeelPipelineEpilogue.getNumOccurrences()) { - cliOptionsSpecified = true; - } - - /// If not specified, select instructions to target for compute. - if (!useMmaSync && !useWmma && !useFma) { - /// First, try to use tensor core. - if (getLhsElementalType() == getRhsElementalType()) { - /// Currently all supported targets at least have WMMA. - /// TODO: Handle targets without tensor core. - if (gpuModel.hasMmaSync) - useMmaSync = true; - else - useWmma = true; - } else { - /// Mixed precision only supported by fma. - useFma = true; - } - } - - /// Prefer smaller subgroup sizes for tensor core strategies. - if (!useFma) - targetSubgroupSize = gpuModel.minSubgroupSize; - - /// Default configuration based on hardware properties and problem bit widths. - if (clBlockTileSizes.getNumOccurrences()) { - blockTileSizes = - SmallVector(clBlockTileSizes.begin(), clBlockTileSizes.end()); - } else { - blockTileSizes = SmallVector{128, 128, 1}; - } - - if (clNumThreads.getNumOccurrences()) { - numThreads = SmallVector(clNumThreads.begin(), clNumThreads.end()); - } else { - // Infer from warp counts if present. 
- if (clNumWarps.getNumOccurrences()) { - numThreads = SmallVector(clNumWarps.begin(), clNumWarps.end()); - numThreads[0] *= getSubgroupSize(); - } else { - numThreads = SmallVector{64, 2, 1}; - } - } - if (clNumWarps.getNumOccurrences()) { - numWarps = SmallVector(clNumWarps.begin(), clNumWarps.end()); - } else { - numWarps = numThreads; - numWarps[0] = llvm::divideCeil(numWarps[0], getSubgroupSize()); - } - if (clUseAsyncCopies.getNumOccurrences()) - useAsyncCopies = clUseAsyncCopies; - else - useAsyncCopies = gpuModel.hasMmaSync; - if (clUseMmaSync.getNumOccurrences()) - useMmaSync = clUseMmaSync; - if (clUseWmma.getNumOccurrences()) - useWmma = clUseWmma; - if (clUseFma.getNumOccurrences()) - useFma = clUseFma; - if (clReductionTileSize.getNumOccurrences()) { - reductionTileSize = clReductionTileSize; - } else { - reductionTileSize = 16; - if (!useFma) { - int64_t maxInputWidth = - std::max(lhsElementalBitWidth(), rhsElementalBitWidth()); - assert(maxInputWidth <= 32 && "requires <= 32-bit types"); - reductionTileSize *= (32 / maxInputWidth); - } - } - if (clPipelineDepth.getNumOccurrences()) { - pipelineDepth = clPipelineDepth; - } else { - pipelineDepth = 0; - if (useAsyncCopies) - pipelineDepth = 3; - } -} - -ArrayAttr -AbstractGemmLikeStrategy::getZeroPadAttrFromElementalTypes(OpBuilder &b) const { - SmallVector paddingValues; - for (Type t : paddingValueTypes) - paddingValues.push_back(b.getZeroAttr(t)); - return b.getArrayAttr(paddingValues); -} - -//===--------------------------------------------------------------------===// -// Validation of support for the configured strategy. -//===--------------------------------------------------------------------===// - -LogicalResult -AbstractGemmLikeStrategy::validate(const GPUModel &gpuModel) const { - if (totalNumThreads() != totalNumWarps() * getSubgroupSize()) { - llvm::errs() << "Number of threads specified by warps must match total " - "number of threads\n"; - return failure(); - } - if (m() < blockTileM()) { - llvm::errs() << "m(" << m() << ") < blockTileM(" << blockTileM() << ") "; - llvm::errs() << "this is at risk of not vectorizing and is NYI"; - return failure(); - } - if (n() < blockTileN()) { - llvm::errs() << "n(" << n() << ") < blockTileN(" << blockTileN() << ") "; - llvm::errs() << "this is at risk of not vectorizing and is NYI"; - return failure(); - } - if (k() < reductionTileSize) { - llvm::errs() << "k(" << k() << ") < reductionTileSize(" << reductionTileSize - << ") "; - llvm::errs() << "this is at risk of not vectorizing and is NYI"; - return failure(); - } - - if (failed(validateLhsCopyMapping())) { - llvm::errs() << "invalid lhs copy mapping"; - return failure(); - } - if (failed(validateRhsCopyMapping())) { - llvm::errs() << "invalid rhs copy mapping"; - return failure(); - } - if (failed(validateResCopyMapping())) { - llvm::errs() << "invalid res copy mapping"; - return failure(); - } - - if (pipelineDepth > 1 && reductionTileSize * pipelineDepth > k()) { - llvm::errs() << "pipeline depth " << pipelineDepth - << " too large for reduction tile size " << reductionTileSize - << " given k " << k(); - return failure(); - } - - bool oneOption = - (useMmaSync ^ useWmma ^ useFma) && !(useMmaSync && useWmma && useFma); - if (!oneOption) { - llvm::errs() << "at most one of useMmaSync, useWmma, useFma can be true"; - return failure(); - } - - if (useMmaSync) { - if (blockTileM() < kMinMmaSyncMinM) { - llvm::errs() << "mma.sync requires at least " << kMinMmaSyncMinM - << " block tile size in M"; - return failure(); - } - 
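The thread/warp bookkeeping in initDefaultValues() and the consistency rule in validate() reduce to simple integer arithmetic. A worked example for the default numThreads = {64, 2, 1} and a subgroup (warp) size of 32 (illustration only):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t subgroupSize = 32;
  int64_t numThreads[3] = {64, 2, 1};
  // numWarps is derived per dimension; only dim 0 is divided by the subgroup.
  int64_t numWarps[3] = {(numThreads[0] + subgroupSize - 1) / subgroupSize,
                         numThreads[1], numThreads[2]};
  assert(numWarps[0] == 2 && numWarps[1] == 2 && numWarps[2] == 1);
  // validate(): total threads must equal total warps * subgroup size.
  int64_t totalThreads = numThreads[0] * numThreads[1] * numThreads[2]; // 128
  int64_t totalWarps = numWarps[0] * numWarps[1] * numWarps[2];         // 4
  assert(totalThreads == totalWarps * subgroupSize);
  // Tensor-core path: reductionTileSize starts at 16 and is scaled by
  // 32 / maxInputBitWidth, e.g. f16 inputs give 16 * (32 / 16) = 32.
  int64_t reductionTileSize = 16 * (32 / 16);
  assert(reductionTileSize == 32);
}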
if (blockTileN() < kMinMmaSyncMinN) { - llvm::errs() << "mma.sync requires at least " << kMinMmaSyncMinN - << " block tile size in N"; - return failure(); - } - if (reductionTileSize < kMinMmaSyncMinK) { - llvm::errs() << "mma.sync requires at least " << kMinMmaSyncMinK - << " block tile size in K"; - return failure(); - } - if (pipelineDepth > 1 && pipelineDepth < kMinMmaSyncPipelineDepth) { - llvm::errs() << "mma.sync pipelining requires at least " - << kMinMmaSyncPipelineDepth << " stages"; - return failure(); - } - if (pipelineDepth > 1 && reductionTileSize * kMinMmaSyncGroups > k()) { - llvm::errs() << "mma.sync pipelining requires at least " - << kMinMmaSyncGroups << " k groups"; - return failure(); - } - } else if (useWmma) { - if (blockTileM() < kMinWmmaMinM) { - llvm::errs() << "wmma requires at least " << kMinWmmaMinM - << " block tile size in M"; - return failure(); - } - if (blockTileN() < kMinWmmaMinN) { - llvm::errs() << "wmma requires at least " << kMinWmmaMinN - << " block tile size in N"; - return failure(); - } - if (reductionTileSize < kMinWmmaMinK) { - llvm::errs() << "wmma requires at least " << kMinWmmaMinK - << " block tile size in K"; - return failure(); - } - } - return success(); -} - -//===--------------------------------------------------------------------===// -// Strategy printing for debugging. -//===--------------------------------------------------------------------===// - -LLVM_DUMP_METHOD void AbstractGemmLikeStrategy::dump() const { - print(llvm::errs()); -} - -void AbstractGemmLikeStrategy::print(llvm::raw_ostream &os) const { - os << "- forced by CLI specification: " - << (cliOptionsSpecified ? "true" : "false") << "\n"; - os << "- block tile sizes: {"; - bool isFirst = true; - for (int64_t blockTileSize : blockTileSizes) { - if (!isFirst) - os << ", "; - os << blockTileSize; - isFirst = false; - } - os << "}\n"; - os << "- reduction tile size: " << reductionTileSize << '\n'; - - os << "- number of threads: {"; - isFirst = true; - for (int64_t numThreadsForDim : numThreads) { - if (!isFirst) - os << ", "; - os << numThreadsForDim; - isFirst = false; - } - os << "}\n"; - - os << "- number of warps: {"; - isFirst = true; - for (int64_t numWarpsForDim : numWarps) { - if (!isFirst) - os << ", "; - os << numWarpsForDim; - isFirst = false; - } - os << "}\n"; - os << "- use async copies: " << useAsyncCopies << '\n'; - os << "- use fma: " << useFma << '\n'; - os << "- use wmma: " << useWmma << '\n'; - os << "- use mma sync: " << useMmaSync << '\n'; - os << "- pipeline depth: " << pipelineDepth << '\n'; - - os << "\n-- Derived quantities --\n"; - os << "- lhs copy:\n"; - lhsCopyMapping().print(os << " -> "); - os << "\n- rhs copy:\n"; - rhsCopyMapping().print(os << " -> "); - os << "\n- res copy:\n"; - resCopyMapping().print(os << " -> "); - os << "\n"; -} - -/// Validates the mapping and emits a diagnostic on failure. 
-LogicalResult AbstractGemmLikeStrategy::validateCopyMapping( - MLIRContext *ctx, const MappingInfo &mapping, StringRef name) const { - int64_t threadsUsed = - std::accumulate(mapping.numThreads.begin(), mapping.numThreads.end(), 1, - std::multiplies()); - if (totalNumThreads() < threadsUsed) { - InFlightDiagnostic diag = emitError(UnknownLoc::get(ctx)) - << "too many threads used for transferring " - << name; - - std::string str; - llvm::raw_string_ostream os(str); - llvm::interleave(mapping.numThreads, os, " * "); - os << " >= " << totalNumThreads(); - diag.attachNote() << os.str(); - return diag; - } - - return success(); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h deleted file mode 100644 index ed7af71bb93b..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_ABSTRACT_GEMM_LIKE_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_ABSTRACT_GEMM_LIKE_STRATEGY_H_ - -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" - -namespace mlir::iree_compiler::gpu { - -struct AbstractGemmLikeStrategy : GPUStrategy { - AbstractGemmLikeStrategy(const GPUModel &gpuModel) : GPUStrategy(gpuModel) {} - - virtual ~AbstractGemmLikeStrategy(); - - //===--------------------------------------------------------------------===// - // Helpers and parameters for configuring the strategy. - //===--------------------------------------------------------------------===// - - /// Initialize values from the CLI. Set cliOptionsSpecified to true if the - /// default CLI values have been overriden. - virtual void initDefaultValues(const GPUModel &gpuModel); - - /// Encodes whether the user has specified any CLI options. When true, the - /// strategy should just run what was specified and is not allowed to - /// override the user's choices. - bool cliOptionsSpecified = false; - - /// Non-default subgroup size to use configured based on hardware supported - /// values. - std::optional targetSubgroupSize = std::nullopt; - - int64_t getSubgroupSize() const { - return targetSubgroupSize ? *targetSubgroupSize : subgroupSize; - } - - //===--------------------------------------------------------------------===// - // Parameters that control the tiling and mapping. - //===--------------------------------------------------------------------===// - - /// Tile sizes for the workgroup / determines grid size for all known - /// reduction strategies. The initial values are set by initDefaultValues(); - SmallVector blockTileSizes; - int64_t reductionTileSize; - SmallVector numThreads; - SmallVector numWarps; - virtual int64_t blockTileM() const = 0; - virtual int64_t blockTileN() const = 0; - - virtual int64_t numWarpsX() const = 0; - virtual int64_t numWarpsY() const = 0; - - virtual MappingInfo getBlockMapping() const = 0; - - /// Common values based on derived quantities. 
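The copy-mapping check above reduces to: the product of the per-dimension thread counts used by a copy must not exceed the block's total thread count. Standalone illustration with made-up sizes:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<int64_t> copyNumThreads = {32, 4}; // threads used by one copy
  int64_t threadsUsed =
      std::accumulate(copyNumThreads.begin(), copyNumThreads.end(),
                      int64_t(1), std::multiplies<int64_t>());
  int64_t totalNumThreads = 128; // e.g. block numThreads = {64, 2, 1}
  // validateCopyMapping fails when threadsUsed exceeds totalNumThreads.
  assert(threadsUsed <= totalNumThreads && "too many threads for the copy");
}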
- int64_t totalNumThreads() const { - int64_t res = 1; - for (auto v : numThreads) - res *= v; - return res; - } - - int64_t totalNumWarps() const { - int64_t res = 1; - for (auto v : numWarps) - res *= v; - return res; - } - - //===--------------------------------------------------------------------===// - // Parameters that control copy/padding transfers from global to shared. - //===--------------------------------------------------------------------===// - SmallVector paddingValueTypes; - SmallVector paddingDimensions; - SmallVector packingDimensions; - - ArrayAttr getZeroPadAttrFromElementalTypes(OpBuilder &b) const; - - virtual Type getLhsElementalType() const = 0; - virtual Type getRhsElementalType() const = 0; - virtual Type getResElementalType() const = 0; - - int64_t lhsElementalBitWidth() const { - return getLhsElementalType().getIntOrFloatBitWidth(); - } - int64_t rhsElementalBitWidth() const { - return getRhsElementalType().getIntOrFloatBitWidth(); - } - int64_t resElementalBitWidth() const { - return getResElementalType().getIntOrFloatBitWidth(); - } - - bool alignedLhs() const { - return m() % blockTileM() == 0 && k() % reductionTileSize == 0; - } - bool alignedRhs() const { - return n() % blockTileN() == 0 && k() % reductionTileSize == 0; - } - bool alignedRes() const { - return m() % blockTileM() == 0 && n() % blockTileN() == 0; - } - - virtual MappingInfo lhsCopyMapping() const = 0; - virtual LogicalResult validateLhsCopyMapping() const = 0; - virtual MappingInfo rhsCopyMapping() const = 0; - virtual LogicalResult validateRhsCopyMapping() const = 0; - virtual MappingInfo resCopyMapping() const = 0; - virtual LogicalResult validateResCopyMapping() const = 0; - - /// Validates the mapping and emits a diagnostic on failure. - LogicalResult validateCopyMapping(MLIRContext *ctx, - const MappingInfo &mapping, - StringRef name) const; - - //===--------------------------------------------------------------------===// - // Parameters that control compute mapping decisions. - //===--------------------------------------------------------------------===// - bool useAsyncCopies; - bool useMmaSync; - bool useWmma; - bool useFma; - int64_t pipelineDepth; - bool peelPipelineEpilogue; - virtual MappingInfo computeMapping() const = 0; - - virtual LogicalResult validate(const GPUModel &gpuModel) const; - - //===--------------------------------------------------------------------===// - // Problem-related quantities. - //===--------------------------------------------------------------------===// - virtual int64_t m() const = 0; - virtual int64_t n() const = 0; - virtual int64_t k() const = 0; - - virtual void print(llvm::raw_ostream &os) const = 0; - virtual LLVM_DUMP_METHOD void dump() const = 0; - - //===--------------------------------------------------------------------===// - // Preconditions of internal transforms lifted to the top-level for more - // actionnable error messages. In the fullness of time, transforms should - // expose preconditions and we should aggregate them automatically. - //===--------------------------------------------------------------------===// - - // TODO: To handle different element types efficiently, it would be much - // better to expose the unrolling to native size explicitly to the transforms - // rather than hide it behind an opaque transform. - - // wmma preconditions that we want to lift out in an actionnable top-level - // error message instead of failing late in the transformation schedule. 
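Worked example of the alignedLhs/alignedRhs/alignedRes predicates above: a 512x1024x256 (MxNxK) matmul with 128x128 block tiles and a reduction tile of 16 is aligned on every operand, so the aligned copy path applies (illustration only):

#include <cassert>
#include <cstdint>

int main() {
  int64_t m = 512, n = 1024, k = 256;
  int64_t blockTileM = 128, blockTileN = 128, reductionTileSize = 16;
  bool alignedLhs = (m % blockTileM == 0) && (k % reductionTileSize == 0);
  bool alignedRhs = (n % blockTileN == 0) && (k % reductionTileSize == 0);
  bool alignedRes = (m % blockTileM == 0) && (n % blockTileN == 0);
  assert(alignedLhs && alignedRhs && alignedRes);
  // A size such as m = 500 would fail alignedLhs/alignedRes and take the
  // unaligned path (masked vectorization of the distributed copies).
}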
- // TODO: These are now hardcoded for f32 but are element-type dependent. - // Precondition: the pipeline transformation for wmma requires at least 2 - // k-groups. - constexpr static int64_t kMinWmmaMinM = 16; - constexpr static int64_t kMinWmmaMinN = 16; - constexpr static int64_t kMinWmmaMinK = 8; - - // mma.sync preconditions that we want to lift out in an actionnable top-level - // error message instead of failing late in the transformation schedule. - // TODO: These are now hardcoded for f32 but are element-type dependent. - // Precondition: the pipeline transformation for mma.sync requires at least 2 - // k-groups. - constexpr static int64_t kMinMmaSyncGroups = 2; - // Precondition: the pipeline transformation for mma.sync requires at least a - // pipeline depth of 3. - constexpr static int64_t kMinMmaSyncPipelineDepth = 3; - // Precondition: if mma.sync is used, the tile sizes must be at least 8x8x4. - constexpr static int64_t kMinMmaSyncMinM = 8; - constexpr static int64_t kMinMmaSyncMinN = 8; - constexpr static int64_t kMinMmaSyncMinK = 4; -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_ABSTRACT_GEMM_LIKE_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel deleted file mode 100644 index 33ea8e9894ed..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2023 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:build_defs.oss.bzl", "iree_compiler_cc_library") - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_compiler_cc_library( - name = "GPU", - srcs = [ - "AbstractGemmLikeStrategy.cpp", - "Common.cpp", - "ConvolutionImplicitGemmStrategy.cpp", - "CopyMapping.cpp", - "MappingInfo.cpp", - "MatmulTensorCoreStrategy.cpp", - "PadStrategy.cpp", - "SmallReductionStrategy.cpp", - "StagedReductionStrategy.cpp", - "Strategies.cpp", - ], - hdrs = [ - "AbstractGemmLikeStrategy.h", - "Common.h", - "ConvolutionImplicitGemmStrategy.h", - "CopyMapping.h", - "MappingInfo.h", - "MatmulTensorCoreStrategy.h", - "PadStrategy.h", - "SmallReductionStrategy.h", - "StagedReductionStrategy.h", - "Strategies.h", - ], - deps = [ - # Dialects - "//compiler/src/iree/compiler/Dialect/LinalgExt/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/TransformExtensions:LinalgExtExtensions", - "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineUtils", - "@llvm-project//mlir:AsyncDialect", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:ArithUtils", - "@llvm-project//mlir:BufferizationDialect", - "@llvm-project//mlir:BufferizationTransforms", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:FunctionInterfaces", - "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:LinalgDialect", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:MemRefTransformOps", - "@llvm-project//mlir:NVGPUDialect", - "@llvm-project//mlir:PDLDialect", - "@llvm-project//mlir:PDLInterpDialect", - "@llvm-project//mlir:SCFDialect", - 
"@llvm-project//mlir:SCFTransformOps", - "@llvm-project//mlir:SCFUtils", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TensorTransformOps", - "@llvm-project//mlir:TransformDialect", - "@llvm-project//mlir:VectorDialect", - "@llvm-project//mlir:VectorTransformOps", - # IR - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Rewrite", - # Other Stuff - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:DialectUtils", - # TransformStrategies - "//compiler/src/iree/compiler/Codegen/TransformStrategies/Common:TransformStrategies", - # TransformExtensions - "//compiler/src/iree/compiler/Codegen/Common/TransformExtensions:CommonExtensions", - "//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions", - "@llvm-project//mlir:LinalgTransformOps", - # TransformMatchers and other stuff - "//llvm-external-projects/iree-dialects:IREEDialectsTransforms", - ], -) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt deleted file mode 100644 index 48b44f3ef353..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt +++ /dev/null @@ -1,82 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
# -################################################################################ - -iree_add_all_subdirs() - -iree_cc_library( - NAME - GPU - HDRS - "AbstractGemmLikeStrategy.h" - "Common.h" - "ConvolutionImplicitGemmStrategy.h" - "CopyMapping.h" - "MappingInfo.h" - "MatmulTensorCoreStrategy.h" - "PadStrategy.h" - "SmallReductionStrategy.h" - "StagedReductionStrategy.h" - "Strategies.h" - SRCS - "AbstractGemmLikeStrategy.cpp" - "Common.cpp" - "ConvolutionImplicitGemmStrategy.cpp" - "CopyMapping.cpp" - "MappingInfo.cpp" - "MatmulTensorCoreStrategy.cpp" - "PadStrategy.cpp" - "SmallReductionStrategy.cpp" - "StagedReductionStrategy.cpp" - "Strategies.cpp" - DEPS - IREEDialectsTransforms - IREELinalgTransformDialect - LLVMSupport - MLIRAffineDialect - MLIRAffineUtils - MLIRAnalysis - MLIRArithDialect - MLIRArithUtils - MLIRAsyncDialect - MLIRBufferizationDialect - MLIRBufferizationTransforms - MLIRFuncDialect - MLIRFunctionInterfaces - MLIRGPUDialect - MLIRIR - MLIRLLVMDialect - MLIRLinalgDialect - MLIRLinalgTransformOps - MLIRMemRefDialect - MLIRMemRefTransformOps - MLIRNVGPUDialect - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRParser - MLIRPass - MLIRRewrite - MLIRSCFDialect - MLIRSCFTransformOps - MLIRSCFUtils - MLIRSupport - MLIRTensorDialect - MLIRTensorTransformOps - MLIRTransformDialect - MLIRVectorDialect - MLIRVectorTransformOps - iree::compiler::Codegen::Common::TransformExtensions::CommonExtensions - iree::compiler::Codegen::LLVMGPU::TransformExtensions::LLVMGPUExtensions - iree::compiler::Codegen::TransformStrategies::Common::TransformStrategies - iree::compiler::Dialect::LinalgExt::IR - iree::compiler::Dialect::LinalgExt::TransformExtensions::LinalgExtExtensions - PUBLIC -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp deleted file mode 100644 index 30322269d80d..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp +++ /dev/null @@ -1,697 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" - -#include - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h" -#include "llvm/Support/ErrorHandling.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" -#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" -#include "mlir/IR/TypeUtilities.h" - -using namespace mlir; - -// TODO: significantly better namespacing. -using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using iree_compiler::IREE::transform_dialect::MapNestedForallToGpuThreadsOp; -using iree_compiler::IREE::transform_dialect::VectorToWarpExecuteOnLane0Op; -using iree_compiler::IREE::transform_dialect::VectorWarpDistributionOp; - -using iree_compiler::buildReductionStrategyBlockDistribution; -using iree_compiler::buildTileFuseDistToForallWithNumThreads; -using iree_compiler::buildTileFuseDistToForallWithTileSizes; -using iree_compiler::maxDivisorOfValueBelowLimit; -using iree_compiler::TileToForallAndFuseAndDistributeResult; -using iree_compiler::gpu::AbstractGemmLikeStrategy; -using iree_compiler::gpu::build1DSplittingStrategyWithOptionalThreadMapping; -using iree_compiler::gpu::buildCommonTrailingStrategy; -using iree_compiler::gpu::buildMapToBlockAndThreads; -using iree_compiler::gpu::GPUModel; -using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; -using iree_compiler::IREE::transform_dialect::IREEBufferizeOp; -using iree_compiler::IREE::transform_dialect::IREEEliminateEmptyTensorsOp; -using iree_compiler::IREE::transform_dialect::ShareForallOperandsOp; -using iree_compiler::IREE::transform_dialect::SynchronizeLoopOp; -using transform::FuseIntoContainingOp; -using transform::MatchOp; -using transform::MemRefEraseDeadAllocAndStoresOp; -using transform::RewriteInDestinationPassingStyleOp; -using transform::ScalarizeOp; -using transform::SequenceOp; - -//===----------------------------------------------------------------------===// -// General helpers. -//===----------------------------------------------------------------------===// - -/// Return max(1, (value * 32) / bitwidth). 
-int64_t mlir::iree_compiler::gpu::scaleUpByBitWidth(int64_t value, - int64_t bitWidth) { - assert((bitWidth & (bitWidth - 1)) == 0 && "bitWidth must be a power of 2"); - return std::max((value * 32) / bitWidth, int64_t(1)); -} - -/// Adjust the number of warps to use to benefit from packing multiple smaller -/// elemental types within a single 128 bit shuffled element. -int64_t mlir::iree_compiler::gpu::adjustNumberOfWarpsForBlockShuffle( - int64_t numWarpsToUse, int64_t bitWidth) { - // Try to scale down the number of warps to use 32b elements in warp shuffles. - assert(((bitWidth & (bitWidth - 1)) == 0) && "bitWidth must be a power of 2"); - int64_t factor; - for (factor = scaleUpByBitWidth(1, bitWidth); factor > 1; factor >>= 1) - if (numWarpsToUse % factor == 0) - break; - numWarpsToUse /= factor; - // Try to scale to using 128b elements in warp shuffles. - return std::max(numWarpsToUse / 4, int64_t(1)); -} - -/// Compute the (splitPoint, vectorSize) pair to break [0 .. upperBound] into -/// [0 .. splitPoint] and [splitPoint + 1 .. upperBound] such that `splitPoint` -/// is a multiple of `fixedSize * vectorSize`. -/// The returned `vectorSize` is the maximal power of `2`, smaller than -/// `maxVectorSize`, for which `splitPoint` can be computed. -/// -/// Note: `vectorSize` may be smaller than `maxVectorSize` when the upperBound -/// is small enough. In such cases we give preference to keeping the `fixedSize` -/// parameter unchanged and reducing the `vectorSize`. `fixedSize` generally -/// captures the number of threads and we do not alter decisions on parallelism -/// at this level. -/// -/// If such a positive multiple exists: -/// 1. if it is `upperBound`, then `upperBound` is an even multiple of -/// `fixedSize` * `vectorSize` and we can tile evenly without splitting. -/// In this case we return (0, vectorSize). -/// 2. otherwise, it is a split point at which we can split with vectorSize -/// to obtain the largest divisible tiling. -/// In this case we return (splitPoint, vectorSize). -/// Otherwise we return (0, 1) to signify no splitting and a vector size of 1. -// TODO: support the dynamic case, taking future stride and alignment into -// account and returning Values. The op then needs to become part of the -// transform dialect. -static std::pair computeSplitPoint(int64_t upperBound, - int64_t fixedSize, - int64_t maxVectorSize) { - assert((maxVectorSize & (maxVectorSize - 1)) == 0 && "must be a power of 2"); - if (ShapedType::isDynamic(upperBound)) { - return std::make_pair(int64_t(0), int64_t(1)); - } - for (int64_t vectorSize = maxVectorSize; vectorSize >= 1; vectorSize >>= 1) { - int64_t splitPoint = - iree_compiler::previousMultipleOf(upperBound, fixedSize * vectorSize); - if (splitPoint > 0) { - return (upperBound == splitPoint) - ? std::make_pair(int64_t(0), vectorSize) - : std::make_pair(splitPoint, vectorSize); - } - } - return std::make_pair(int64_t(0), int64_t(1)); -} - -//===----------------------------------------------------------------------===// -// Low-level reusable retargetable builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -/// Post-bufferization mapping to blocks and threads. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. 
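The computeSplitPoint helper above can be exercised in isolation. A standalone restatement for static shapes only, with worked values (the `Ref` name is hypothetical; previousMultipleOf is inlined as floordiv * multiple):

#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<int64_t, int64_t>
computeSplitPointRef(int64_t upperBound, int64_t fixedSize,
                     int64_t maxVectorSize) {
  for (int64_t vectorSize = maxVectorSize; vectorSize >= 1; vectorSize >>= 1) {
    int64_t chunk = fixedSize * vectorSize;
    int64_t splitPoint = (upperBound / chunk) * chunk;
    if (splitPoint > 0)
      return (splitPoint == upperBound)
                 ? std::make_pair(int64_t(0), vectorSize) // evenly divisible
                 : std::make_pair(splitPoint, vectorSize);
  }
  return {0, 1}; // no splitting, scalar fallback
}

int main() {
  // 128 elements, 32 threads: evenly divisible with vector size 4.
  assert(computeSplitPointRef(128, 32, 4) ==
         std::make_pair(int64_t(0), int64_t(4)));
  // 100 elements, 32 threads: split at 64 with vector size 2, tail of 36.
  assert(computeSplitPointRef(100, 32, 4) ==
         std::make_pair(int64_t(64), int64_t(2)));
  // Nothing fits: no split, vector size 1.
  assert(computeSplitPointRef(7, 32, 4) ==
         std::make_pair(int64_t(0), int64_t(1)));
}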
-Value mlir::iree_compiler::gpu::buildMapToBlockAndThreads( - ImplicitLocOpBuilder &b, Value funcH, ArrayRef blockSize, - std::optional subgroupSize) { - b.create(funcH); - auto mapToThreadsOp = - b.create(funcH, blockSize); - if (subgroupSize) - mapToThreadsOp.setSubgroupSize(*subgroupSize); - return funcH; -} - -/// Post-bufferization vector distribution with rank-reduction. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -Value mlir::iree_compiler::gpu::buildDistributeVectors(ImplicitLocOpBuilder &b, - Value variantH, - Value funcH, - int64_t warpSize) { - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - }); - Value ifH = b.create(funcH, scf::IfOp::getOperationName()); - // Locally suppress failures for this op only because it doesn't cover the - // `threadIdx.x == 0 && threadIdx.y == 0` case at the moment. - auto sequence = b.create( - TypeRange(), transform::FailurePropagationMode::Suppress, variantH, - /*extraBindings=*/ValueRange()); - { - OpBuilder::InsertionGuard guard(b); - b.createBlock(&sequence.getBody(), sequence.getBody().begin(), - transform::AnyOpType::get(b.getContext()), b.getLoc()); - ifH = b.create(ifH, warpSize); - b.create(); - } - b.create(funcH); - return funcH; -} - -//===----------------------------------------------------------------------===// -// Mid-level problem-specific strategy builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -void mlir::iree_compiler::gpu:: - build1DSplittingStrategyWithOptionalThreadMapping( - ImplicitLocOpBuilder &b, Value variantH, Value opH, int64_t rank, - int64_t mostMinorDim, SmallVector opSizes, int64_t numThreads, - Attribute mappingAttr, int64_t maxVectorSize) { - // Poor man's handling of optionality in C++. Will need to be converted to - // proper transform dialect filters or handling of emptiness. - if (rank == 0) - return; - - // Compute split point to guarantee we form a maximal chunk divisible by - // numThreads * vectorSize. - // This chunk is currently not aligned for proper vector accesses. - // In the future, this can be solved either by: - // 1. doing an extra prologue split that is cognizant of the future stride. - // 2. or, aligning allocations to a multiple of 128b on the most minor - // dimensions but without changing problem sizes (i.e. poor man's - // packing). - int64_t mostMinorSize = opSizes[mostMinorDim]; - auto [splitPoint, vectorSize] = computeSplitPoint( - /*upperBound=*/mostMinorSize, /*fixedSize=*/numThreads, - /*maxVectorSize=*/maxVectorSize); - - // Create 1-D tile sizes for the first, divisible, part. - SmallVector scfForTileSizes(rank, 0), foreachTileSizes(rank, 0); - scfForTileSizes[mostMinorDim] = numThreads * vectorSize; - foreachTileSizes[mostMinorDim] = numThreads; - - // Split, tile and map the most minor dimension to `mappingAttr`. - if (splitPoint > 0) { - auto anyOpType = transform::AnyOpType::get(b.getContext()); - auto split = b.create( - anyOpType, anyOpType, opH, mostMinorDim, Value(), splitPoint); - opH = split.getFirst(); - if (vectorSize > 1) { - auto res = iree_compiler::buildTileFuseToScfFor( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/opH, - /*opsHToFuse=*/{}, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr({scfForTileSizes}))); - opH = res.tiledOpH; - // Reset the vector size to 1 for the tail, which is known to not be - // divisible by `numThreads * vectorSize`. 
- vectorSize = 1; - } - if (numThreads > 1) { - assert(mappingAttr && "must specify a mapping attribute"); - iree_compiler::buildTileFuseDistToForallWithNumThreads( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/opH, - /*opsHToFuse=*/{}, - /*numThreads=*/getAsOpFoldResult(b.getI64ArrayAttr(foreachTileSizes)), - /*threadDimMapping=*/b.getArrayAttr({mappingAttr})); - } - opH = split.getSecond(); - } - - // Tile and map the most minor dimension of the remainder to mappingAttr. - if (vectorSize > 1) { - auto res = iree_compiler::buildTileFuseToScfFor( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/opH, - /*opsHToFuse=*/{}, - /*tileSizes=*/getAsOpFoldResult(b.getI64ArrayAttr({scfForTileSizes}))); - opH = res.tiledOpH; - } - if (numThreads > 1) { - assert(mappingAttr && "must specify a mapping attribute"); - iree_compiler::buildTileFuseDistToForallWithNumThreads( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/opH, - /*opsHToFuse=*/{}, - /*numThreads=*/getAsOpFoldResult(b.getI64ArrayAttr(foreachTileSizes)), - /*threadDimMapping=*/b.getArrayAttr({mappingAttr})); - } -} - -/// Take care of the last common steps in a GPU strategy (i.e. vectorize, -/// bufferize, maps to blocks and threads and distribute vectors). -/// Return the handles to the updated variant and the func::FuncOp ops under -/// the variant op. -std::pair mlir::iree_compiler::gpu::buildCommonTrailingStrategy( - ImplicitLocOpBuilder &b, Value variantH, - ArrayRef numThreadsInBlock) { - Value funcH = b.create(variantH, func::FuncOp::getOperationName()); - - // Step N-5. Fold tensor.empty to avoid large allocations. - // Step N-4. Perform a pass of canonicalization + enabling after tiling. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - funcH = iree_compiler::buildVectorize(b, funcH); - - // Step N-3. Perform a pass of canonicalization + enabling after vectorization - // as well as hoisting subset operations such as vector.transfer_read/write. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - iree_compiler::buildHoisting(b, funcH); - - // Step N-2. Bufferize and drop HAL descriptor from memref ops. - variantH = iree_compiler::buildBufferize(b, variantH, /*targetGpu=*/true); - - // Step N-1. Post-bufferization mapping to blocks and threads. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single func::FuncOp to transform, may need hardening. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - funcH = buildMapToBlockAndThreads(b, funcH, numThreadsInBlock); - - // Step N. Perform a final pass of canonicalization + enabling before - // returning. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - return std::make_pair(variantH, funcH); -} - -//===----------------------------------------------------------------------===// -// Subset of mid-level builders currently used for GEMM-like problems. -//===----------------------------------------------------------------------===// - -/// Build transform IR to hoist the padded output operand of a padded matmul. -/// Additionally, this attempts to fold the padding into the producing fill, if -/// available. 
-Value mlir::iree_compiler::gpu::buildHoistOutputPaddingOp( - ImplicitLocOpBuilder &b, Value variantH, Value paddedMatmulOpH, - int64_t numLoopsToHoist) { - // Find the output pad and hoist it. - // TODO: don't hardcode output operand number. - // TODO: Better builders. - Value outputH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(2)); - - // Hoist the padding above the 1 innermost reduction loop. - auto padOpType = transform::OperationType::get( - b.getContext(), tensor::PadOp::getOperationName()); - outputH = b.create(padOpType, outputH); - b.create(paddedMatmulOpH.getType(), outputH, - numLoopsToHoist); - - // Perform a pass of canonicalization cleanups + folding fill + pad into pad - // by applying `foldTensorSubsets` and `tilingCanonicalization`. - { - Value funcH = b.create( - variantH, func::FuncOp::getOperationName()); - iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create< - transform::ApplyMergeConsecutiveInsertExtractSlicePatternsOp>( - loc); - }); - } - - // The canonicalization above should have rewritten hoistPad into a FillOp. - // Unfortunately, the listener drops handles if the op types don't match. We - // need better behavior here, for now we rematch. - // TODO: use value handles. - Value fillOpH = b.create( - variantH, linalg::FillOp::getOperationName()); - - return fillOpH; -} - -/// Helper function to distribute one pad or copy operation. -/// Note: When `foldIfBranch` is true, one must later perform masked -/// vectorization of the result. -/// This amounts to injecting knowledge about future transformations without -/// adding leaky semantics. -std::tuple -mlir::iree_compiler::gpu::buildDistributeOnePadOrCopyWithTileSizes( - ImplicitLocOpBuilder &b, Value variantH, Value copyOpH, - ArrayRef tileSizes, ArrayRef threadDimMapping, - bool foldIfBranch) { - TileToForallAndFuseAndDistributeResult res = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/copyOpH, - /*opsToFuseH=*/{}, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(tileSizes)), - /*threadDimMapping=*/ - b.getArrayAttr(threadDimMapping)); - if (foldIfBranch) { - Value ifOpH = b.create(res.forallH, - scf::IfOp::getOperationName()); - b.create( - ifOpH, /*takeElseBranch=*/b.getUnitAttr()); - } - return std::make_tuple(res.tiledOpH, res.forallH); -} - -/// Helper function to distribute one pad or copy operation. -/// Note: When `foldIfBranch` is true, one must later perform masked -/// vectorization of the result. -/// This amounts to injecting knowledge about future transformations without -/// adding leaky semantics. -Value mlir::iree_compiler::gpu::buildDistributeOnePadOrCopyWithNumThreads( - ImplicitLocOpBuilder &b, Value variantH, Value copyOpH, - ArrayRef numThreads, ArrayRef threadDimMapping, - bool foldIfBranch) { - TileToForallAndFuseAndDistributeResult res = - buildTileFuseDistToForallWithNumThreads( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/copyOpH, - /*opsToFuseH=*/{}, - /*numThreads=*/ - getAsOpFoldResult(b.getI64ArrayAttr(numThreads)), - /*threadDimMapping=*/ - b.getArrayAttr(threadDimMapping)); - if (foldIfBranch) { - Value ifOpH = b.create(res.forallH, - scf::IfOp::getOperationName()); - b.create( - ifOpH, /*takeElseBranch=*/b.getUnitAttr()); - } - return res.tiledOpH; -} - -/// Distribute the explicit copies involved in a matmul operation -/// `paddedMatmulOpH`. 
-std::tuple -mlir::iree_compiler::gpu::buildDistributeMatmulCopies( - ImplicitLocOpBuilder &b, Value variantH, Value paddedMatmulOpH, - const AbstractGemmLikeStrategy &strategy) { - // Aligned vs unaligned handling deviates here by converting the pads to - // copies for the aligned case. - // TODO: Unify aligned and unaligned codegen. - Value copyBackOpH; - if (!strategy.alignedRes()) { - // Explicitly materialize the parent parallel_insert into a copy to avoid - // late bufferization interferences. - // TODO: Avoid brittle rematching. - Value insertSliceH = b.create( - variantH, tensor::ParallelInsertSliceOp::getOperationName()); - copyBackOpH = b.create( - insertSliceH.getType(), insertSliceH); - } else { - Value resH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(2)); - copyBackOpH = - b.create(resH.getType(), resH); - } - - Value lhsH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(0)); - Value rhsH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(1)); - - // Rewrite aligned pads as destination passing (linalg.copy) - if (strategy.alignedLhs() && strategy.packingDimensions[0]) - lhsH = b.create(lhsH.getType(), lhsH); - if (strategy.alignedRhs() && strategy.packingDimensions[1]) - rhsH = b.create(rhsH.getType(), rhsH); - - MappingInfo lhsCopyMapping = strategy.lhsCopyMapping(); - Value lhsCopyOpH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, lhsH, /*numThreads=*/lhsCopyMapping.numThreads, - /*threadDimMapping=*/lhsCopyMapping.threadMapping, - /*foldIfBranch=*/!strategy.alignedLhs()); - - MappingInfo rhsCopyMapping = strategy.rhsCopyMapping(); - Value rhsCopyOpH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, rhsH, /*numThreads=*/rhsCopyMapping.numThreads, - /*threadDimMapping=*/rhsCopyMapping.threadMapping, - /*foldIfBranch=*/!strategy.alignedRhs()); - - if (!strategy.alignedRes()) { - MappingInfo resCopyMapping = strategy.resCopyMapping(); - copyBackOpH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, copyBackOpH, - /*numThreads=*/resCopyMapping.numThreads, - /*threadDimMapping=*/resCopyMapping.threadMapping); - } - - return std::make_tuple(lhsCopyOpH, rhsCopyOpH, copyBackOpH); -} - -/// Specific pattern to perform masked vectorization of copies give as -/// parameters, cleanup and vectorize the rest. -// TODO: generalize and don't hardcode. -void mlir::iree_compiler::gpu::buildMatmulVectorization( - ImplicitLocOpBuilder &b, Value variantH, Value lhsCopyOpH, Value rhsCopyOpH, - Value copyBackOpH, const AbstractGemmLikeStrategy &strategy, - bool vectorizePadding, bool vectorizeNdExtract) { - // Canonicalize to make padOp outputs static shaped: this is currently a - // prerequisite for vector masking. - // Also, no canonicalization is allowed after vector masking and before we - // lower the masks: masks are currently quite brittle and do not like - // canonicalization or anything else that may insert an op in their region. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Apply vector masking. 
- if (!strategy.alignedLhs()) { - MappingInfo lhsCopyMapping = strategy.lhsCopyMapping(); - SmallVector scalableSizes(lhsCopyMapping.tileSizes.size(), false); - b.create(lhsCopyOpH, ValueRange(), - lhsCopyMapping.tileSizes, nullptr, - scalableSizes); - } - if (!strategy.alignedRhs()) { - MappingInfo rhsCopyMapping = strategy.rhsCopyMapping(); - SmallVector scalableSizes(rhsCopyMapping.tileSizes.size(), false); - b.create(rhsCopyOpH, ValueRange(), - rhsCopyMapping.tileSizes, nullptr, - scalableSizes); - } - if (!strategy.alignedRes()) { - MappingInfo resCopyMapping = strategy.resCopyMapping(); - SmallVector scalableSizes(resCopyMapping.tileSizes.size(), false); - b.create(copyBackOpH, ValueRange(), - resCopyMapping.tileSizes, nullptr, - scalableSizes); - } - - // Lower all masked vector transfers at this point, as they make - // canonicalization generate incorrect IR. - // TODO: don't rematch, apply on the variant op directly. - funcH = - b.create(variantH, func::FuncOp::getOperationName()); - buildLowerMaskedTransfersAndCleanup(b, funcH, /*cleanup=*/false); - - // Apply vectorization + cleanups to what remains. - funcH = iree_compiler::buildVectorize(b, funcH, /*applyCleanups=*/true, - vectorizePadding, vectorizeNdExtract); -} - -/// Build the transform IR to perform conversion to tensor core operations. -/// This is currently subject to phase orderings as follows: -/// - Vector transfer_read and transfer_write patterns have different subview -/// folding behavior, force a fold_memref_aliases on them to enable -/// redundant vector transfer hoisting. -/// - Unfortunately, fold_memref_aliases breaks vector_to_mma conversion -/// across scf.for after unrolling due to insert_strided_slice / -/// extract_strided_slice across iter_args boundaries. -/// - Hoist redundant vector transfers to allow conversion to tensor core to -/// proceed. We really don't want to do this after bufferization but we need -/// to atm. -Value mlir::iree_compiler::gpu::buildConvertToTensorCoreOp( - ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy) { - // TODO: Fewer canonicalization. - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - b.create(funcH); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - if (strategy.useWmma) { - b.create( - funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - }); - } else if (strategy.useMmaSync) { - b.create( - funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - }); - } /* else nothing to do for fma here */ - - Value forH = b.create( - transform::OperationType::get(b.getContext(), "scf.for"), funcH, - b.getStrArrayAttr({scf::ForOp::getOperationName()}), - /*matchInterfaceEnum=*/transform::MatchInterfaceEnumAttr(), - /*opAttrs=*/DictionaryAttr(), /*filterResultType=*/TypeAttr(), - /*filterOperandTYpes=*/ArrayAttr()); - // TODO: At this time, this synchronization is needed for applying the - // HoistRedundantVectorTransfersOp transform correctly. This is because the - // transform does not take parallelism into accound. - // In the future, HoistRedundantVectorTransfersOp + SynchronizeLoopOp need to - // be replaced by a single transform. - b.create(forH); - - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH); - // TODO: not a functional style transform and avoid returning funcH. 
- funcH = b.create( - transform::AnyOpType::get(b.getContext()), funcH); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - b.create(funcH); - - if (strategy.useWmma) { - auto vectorToMMaConversionOp = b.create< - iree_compiler::IREE::transform_dialect::VectorToMMAConversionOp>(funcH); - // TODO: proper builder instead of a setting post-hoc. - vectorToMMaConversionOp.setUseWmma(true); - } else if (strategy.useMmaSync) { - auto vectorToMMaConversionOp = b.create< - iree_compiler::IREE::transform_dialect::VectorToMMAConversionOp>(funcH); - // TODO: proper builder instead of a setting post-hoc. - vectorToMMaConversionOp.setUseMmaSync(true); - } /* else nothing to do for fma here */ - - // Post-hoc elimination of barriers. - funcH = b.create(funcH); - return funcH; -} - -void mlir::iree_compiler::gpu::buildMultiBuffering( - ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy) { - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - // TODO: Avoid brittle matching here. - // TODO: Better builder after integrate. - Value allocH = b.create( - transform::OperationType::get(b.getContext(), "memref.alloc"), funcH, - b.getStrArrayAttr({memref::AllocOp::getOperationName()}), - /*matchInterfaceEnum=*/transform::MatchInterfaceEnumAttr(), - /*opAttrs=*/DictionaryAttr(), /*filterResultType=*/TypeAttr(), - /*filterOperandTypes=*/ArrayAttr()); - // TODO: Better builder instead of setting post-hoc. - auto multiBufferOp = b.create( - transform::AnyOpType::get(b.getContext()), allocH); - multiBufferOp.setFactor(strategy.pipelineDepth); - multiBufferOp.setSkipAnalysis(true); -} - -Value mlir::iree_compiler::gpu::buildConvertToAsyncCopies( - ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy) { - b.create(funcH, [&](OpBuilder &b, Location loc) { - // Atm, vectors need to be lowered to 1-D for cp.async mapping to connect. - // TODO: not a functional style op to avoid invalidating artificially. - auto transferToScfOp = - b.create(loc); - // TODO: proper builder instead of a setting post-hoc. - transferToScfOp.setMaxTransferRank(1); - transferToScfOp.setFullUnroll(true); - }); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - auto createAsyncGroupOp = - b.create( - TypeRange{}, funcH); - if (strategy.useMmaSync) { - // TODO: proper builder instead of a setting post-hoc. - createAsyncGroupOp.setUseMmaSync(strategy.useMmaSync); - } - iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - return funcH; -} - -void mlir::iree_compiler::gpu::buildPipelineSharedMemoryCopies( - ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy) { - Value computeOpH; - if (strategy.useWmma) { - computeOpH = b.create( - funcH, mlir::gpu::SubgroupMmaComputeOp::getOperationName()); - } else if (strategy.useMmaSync) { - computeOpH = b.create( - funcH, mlir::nvgpu::MmaSyncOp::getOperationName()); - } else { - assert(strategy.useFma); - computeOpH = b.create( - funcH, mlir::vector::ContractionOp::getOperationName()); - } - // TODO: Better builder. - Value forOpH = b.create( - transform::AnyOpType::get(b.getContext()), computeOpH, - /*isolated_from_above=*/false, /*allow_empty_results=*/false, - /*op_name=*/b.getStringAttr("scf.for"), /*deduplicate=*/true); - // TODO: Better builder instead of setting post-hoc.
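The multi-buffering factor and the pipelining depth configured around here both come from strategy.pipelineDepth. As a rough illustration of what pipelining the shared memory copies buys (this sketch is not part of the diff and is independent of the transform ops), the schedule overlaps the copy for iteration i with the compute of iteration i - (depth - 1), cycling through depth buffer slots; a minimal standalone C++ sketch with made-up trip counts:

#include <cstdint>
#include <iostream>

// Illustrative only: shows the slot rotation that multi-buffering with a
// factor of `depth` enables; the trip count and depth below are hypothetical.
int main() {
  constexpr int64_t depth = 3;    // corresponds to strategy.pipelineDepth
  constexpr int64_t numIters = 8; // reduction loop trip count
  // Prologue: fill the first depth - 1 slots before any compute starts.
  for (int64_t i = 0; i < depth - 1; ++i)
    std::cout << "prologue: copy iter " << i << " -> slot " << i % depth << "\n";
  // Steady state: issue the copy for iter i while computing iter i-(depth-1).
  for (int64_t i = depth - 1; i < numIters; ++i)
    std::cout << "copy iter " << i << " -> slot " << i % depth
              << ", compute iter " << i - (depth - 1) << " from slot "
              << (i - (depth - 1)) % depth << "\n";
  // Epilogue: drain the remaining computes (emitted separately when the
  // epilogue is peeled).
  for (int64_t i = numIters; i < numIters + depth - 1; ++i)
    std::cout << "epilogue: compute iter " << i - (depth - 1) << " from slot "
              << (i - (depth - 1)) % depth << "\n";
}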
- auto pipelineOp = b.create< - iree_compiler::IREE::transform_dialect::PipelineSharedMemoryCopiesOp>( - transform::AnyOpType::get(b.getContext()), forOpH); - // TODO: depth from strategy, or directly from individual buffers. - pipelineOp.setDepth(strategy.pipelineDepth); - pipelineOp.setUseMmaSync(strategy.useMmaSync); - pipelineOp.setPeelEpilogue(strategy.peelPipelineEpilogue); -} - -Value mlir::iree_compiler::gpu::buildBufferize(ImplicitLocOpBuilder &b, - Value variantH) { - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create( - funcH); - b.create(funcH); - b.create(funcH); - auto bufferizeOp = b.create(funcH, /*targetGpu=*/true); - bufferizeOp.setTargetGpu(true); - variantH = bufferizeOp.getResult(); - Value memrefFunc = - b.create(variantH, func::FuncOp::getOperationName()); - b.create(memrefFunc); - return variantH; -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h deleted file mode 100644 index 6d2934e0c342..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COMMON_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COMMON_H_ - -#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" -#include "llvm/ADT/StringRef.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/IR/BuiltinOps.h" - -namespace mlir::iree_compiler::gpu { - -struct GPUModel; - -//===----------------------------------------------------------------------===// -// Base quantities generally useful for all GPU strategies. -//===----------------------------------------------------------------------===// -inline Attribute threadX(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, mlir::gpu::MappingId::DimX); -} -inline Attribute threadY(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, mlir::gpu::MappingId::DimY); -} -inline Attribute threadZ(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, mlir::gpu::MappingId::DimZ); -} -inline Attribute warpX(MLIRContext *ctx) { - return mlir::gpu::GPUWarpMappingAttr::get(ctx, mlir::gpu::MappingId::DimX); -} -inline Attribute warpY(MLIRContext *ctx) { - return mlir::gpu::GPUWarpMappingAttr::get(ctx, mlir::gpu::MappingId::DimY); -} -inline Attribute warpZ(MLIRContext *ctx) { - return mlir::gpu::GPUWarpMappingAttr::get(ctx, mlir::gpu::MappingId::DimZ); -} -inline Attribute linearId0(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, - mlir::gpu::MappingId::LinearDim0); -} -inline Attribute linearId1(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, - mlir::gpu::MappingId::LinearDim1); -} -inline Attribute linearId2(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, - mlir::gpu::MappingId::LinearDim2); -} - -//===----------------------------------------------------------------------===// -// General helpers. 
-//===----------------------------------------------------------------------===// -static constexpr int64_t kCudaMaxVectorLoadBitWidth = 128; - -/// Return max(1, (value * 32) / bitWidth). -int64_t scaleUpByBitWidth(int64_t value, int64_t bitWidth); - -/// Adjust the number of warps to use to benefit from packing multiple smaller -/// elemental types within a single 128 bit shuffled element. -int64_t adjustNumberOfWarpsForBlockShuffle(int64_t numWarpsToUse, - int64_t bitWidth); - -//===----------------------------------------------------------------------===// -// Low-level reusable retargetable builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -/// Post-bufferization mapping to blocks and threads. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -/// Takes an optional `subgroupSize` argument to specify the number of threads -/// per subgroup. -Value buildMapToBlockAndThreads( - ImplicitLocOpBuilder &b, Value funcH, ArrayRef blockSize, - std::optional subgroupSize = std::nullopt); - -/// Post-bufferization vector distribution with rank-reduction. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -Value buildDistributeVectors(ImplicitLocOpBuilder &b, Value variantH, - Value funcH, int64_t warpSize); - -/// Take care of the last common steps in a GPU strategy (i.e. vectorize, -/// bufferize, maps to blocks and threads and distribute vectors). -/// Return the handles to the updated variant and the function ops under -/// the variant op. -// TODO: abstract away AbstractReductionStrategy, this is supposed to be -// retargetable. -std::pair -buildCommonTrailingStrategy(ImplicitLocOpBuilder &b, Value variantH, - ArrayRef numThreadsInBlock); - -//===----------------------------------------------------------------------===// -// Mid-level problem-specific strategy builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -/// Take a handle `opH` to a Linalg op of rank `rank`, sizes `opSizes` and for -/// which we know the most minor dimension `mostMinorDim` (assuming all accesses -/// are contiguous along that dimension for now). -/// Build a schedule that maps `mostMinorDim` to a `scf.forall` op. -/// When `numThreads` > 1, the `scf.forall` is also mapped to -/// `mappingAttr` (which must then be non-null). -/// The constructed schedule first performs a split of the largest possible -/// multiple of `numThreads * maxVectorSize` to form a maximally divisible -/// region. -// TODO: More robustness wrt selecting the most minor dimension otherwise -// performance may suffer. -// TODO: Split point should be dynamic and aware of future stride / alignment -// to also guarantee proper vector alignments. OTOH this is a non-trivial bump -// in schedule complexity and can be handled with simple padding of the -// underlying allocation. -void build1DSplittingStrategyWithOptionalThreadMapping( - ImplicitLocOpBuilder &b, Value variantH, Value opH, int64_t rank, - int64_t mostMinorDim, SmallVector opSizes, int64_t numThreads, - Attribute mappingAttr = Attribute(), int64_t maxVectorSize = 4); - -/// Build transform IR to hoist the padded output operand of a padded matmul. -/// Additionally, this attempts to fold the padding into the producing fill, if -/// available. -// TODO: Generalize, this is not specific to a matmul. 
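For the 1-D splitting strategy documented a few lines above, the "maximally divisible region" is simply the largest multiple of numThreads * maxVectorSize that fits in the problem size; the remainder is tiled and mapped separately. A minimal standalone sketch (plain C++ rather than the transform-dialect builders; the sizes below are hypothetical):

#include <cassert>
#include <cstdint>
#include <iostream>

// Illustrative sketch of the split point described by
// build1DSplittingStrategyWithOptionalThreadMapping's documentation: the
// largest multiple of (numThreads * maxVectorSize) not exceeding `size`.
static int64_t computeSplitPoint(int64_t size, int64_t numThreads,
                                 int64_t maxVectorSize) {
  int64_t chunk = numThreads * maxVectorSize;
  assert(chunk > 0 && "expected a positive chunk size");
  return (size / chunk) * chunk; // may be 0 when size < chunk
}

int main() {
  // E.g. 1000 elements, 64 threads, vector width 4:
  // chunk = 256, split point = 768, remainder = 232 elements.
  int64_t size = 1000, numThreads = 64, maxVectorSize = 4;
  int64_t split = computeSplitPoint(size, numThreads, maxVectorSize);
  std::cout << "divisible region: [0, " << split << "), remainder: "
            << (size - split) << " elements\n";
}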
-// TODO: Better API -Value buildHoistOutputPaddingOp(ImplicitLocOpBuilder &b, Value variantH, - Value paddedMatmulOpH, - int64_t numLoopsToHoist = 1); - -/// Helper function to distribute one pad or copy operation with specified num -/// threads. -/// Note: When `foldIfBranch` is true, one must later perform masked -/// vectorization of the result. -/// This amounts to injecting knowledge about future transformations without -/// adding leaky semantics. -Value buildDistributeOnePadOrCopyWithNumThreads( - ImplicitLocOpBuilder &b, Value variantH, Value copyOpH, - ArrayRef numThreads, ArrayRef threadDimMapping, - bool foldIfBranch = false); - -/// Helper function to distribute one pad or copy operation with specified tile -/// sizes. -/// Note: When `foldIfBranch` is true, one must later perform masked -/// vectorization of the result. -/// This amounts to injecting knowledge about future transformations without -/// adding leaky semantics. -std::tuple buildDistributeOnePadOrCopyWithTileSizes( - ImplicitLocOpBuilder &b, Value variantH, Value copyOpH, - ArrayRef tileSizes, ArrayRef threadDimMapping, - bool foldIfBranch = false); - -/// Distribute the explicit copies involved in a matmul operation -/// `paddedMatmulOpH`. -std::tuple -buildDistributeMatmulCopies(ImplicitLocOpBuilder &b, Value variantH, - Value paddedMatmulOpH, - const AbstractGemmLikeStrategy &strategy); - -/// Specific pattern to perform masked vectorization of copies give as -/// parameters, cleanup and vectorize the rest. -void buildMatmulVectorization(ImplicitLocOpBuilder &b, Value variantH, - Value lhsCopyOpH, Value rhsCopyOpH, - Value copyBackOpH, - const AbstractGemmLikeStrategy &strategy, - bool vectorizePadding = false, - bool vectorizeNdExtract = false); - -/// Build the transform IR to perform conversion to tensor core operations. -/// This is currently subject to phase orderings as follows: -/// - Vector transfer_read and transfer_write patterns have different subview -/// folding behavior, force a fold_memref_aliases on them to enable -/// redundant vector transfer hoisting. -/// - Unfortunately, fold_memref_aliases breaks vector_to_mma conversion -/// across scf.for after unrolling due to insert_strided_slice / -/// extract_strided_slice across iter_args boundaries. -/// - Hoist redundant vector transfers to allow conversion to tensor core to -/// proceed. We really don't want to do this after bufferization but we need -/// to atm. 
-Value buildConvertToTensorCoreOp(ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy); - -void buildMultiBuffering(ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy); - -Value buildConvertToAsyncCopies(ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy); - -void buildPipelineSharedMemoryCopies(ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy); - -Value buildBufferize(ImplicitLocOpBuilder &b, Value variantH); - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COMMON_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp deleted file mode 100644 index 0af1afac3110..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp +++ /dev/null @@ -1,359 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Transform/IR/TransformTypes.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") - -// TODO: significantly better namespacing. 
-using iree_compiler::buildPad; -using iree_compiler::buildSelectFirstNonEmpty; -using iree_compiler::buildTileFuseDistToForallWithNumThreads; -using iree_compiler::buildTileFuseDistToForallWithTileSizes; -using iree_compiler::TileToForallAndFuseAndDistributeResult; -using iree_compiler::gpu::buildBufferize; -using iree_compiler::gpu::buildConvertToAsyncCopies; -using iree_compiler::gpu::buildConvertToTensorCoreOp; -using iree_compiler::gpu::buildDistributeMatmulCopies; -using iree_compiler::gpu::buildHoistOutputPaddingOp; -using iree_compiler::gpu::buildMatmulVectorization; -using iree_compiler::gpu::buildMultiBuffering; -using iree_compiler::gpu::buildPipelineSharedMemoryCopies; -using iree_compiler::gpu::ImplicitGemmStrategy; -using iree_compiler::gpu::MappingInfo; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::IREE::transform_dialect::ApplyBubbleCollapsePatternsOp; -using iree_compiler::IREE::transform_dialect:: - ApplyFoldReshapeIntoTensorHalInterfacePatternsOp; -using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; -using iree_compiler::IREE::transform_dialect:: - PopulateWorkgroupCountRegionUsingNumThreadsSliceOp; -using transform::ConvertConv2DToImg2ColOp; -using transform::FuseIntoContainingOp; -using transform::MatchOp; -using transform::TileUsingForOp; -using transform_ext::RegisterMatchCallbacksOp; - -/// Options to set the default values of the matmul strategy. - -void ImplicitGemmStrategy::initDefaultValues(const GPUModel &gpuModel) { - assert(captures.convolutionDims.outputChannel.size() >= 1 && - "requires at least one output channel dimension"); - assert(captures.convolutionDims.inputChannel.size() >= 1 && - "requires at least one input channel dimension"); - assert(captures.convolutionDims.outputImage.size() >= 1 && - "requires at least one output image dimension"); - assert(captures.convolutionDims.filterLoop.size() >= 1 && - "requires at least one filter loop dimension"); - - // It is an NCHW conv if the output channel precedes the output image - // dimensions. - // TODO: This should be inferred directly from the shape of the input (i.e. - // input indexing map) rather than overall iterator classes. - filterLHS = captures.convolutionDims.outputChannel[0] < - captures.convolutionDims.outputImage[0]; - - int64_t channelSize = 1; - for (auto dim : captures.convolutionDims.outputChannel) - channelSize *= captures.convolutionOpSizes[dim]; - int64_t imageSize = 1; - for (auto dim : captures.convolutionDims.outputImage) - imageSize *= captures.convolutionOpSizes[dim]; - - derivedN = channelSize; - derivedM = imageSize; - if (filterLHS) - std::swap(derivedM, derivedN); - - derivedK = 1; - for (auto dim : captures.convolutionDims.filterLoop) - derivedK *= captures.convolutionOpSizes[dim]; - for (auto dim : captures.convolutionDims.inputChannel) - derivedK *= captures.convolutionOpSizes[dim]; - - // TODO: Capture input/output element types properly for configuring the - // padding values. - paddingValueTypes = {captures.inputElementType, captures.filterElementType, - captures.outputElementType}; - paddingDimensions = {0, 1, 2, 3}; - // TODO: Re-enable once padding works with the img2col op. - packingDimensions = - filterLHS ? SmallVector{1, 0, 1} : SmallVector{0, 1, 1}; - - // Pull in tile configs from flags. 
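The derivation of derivedM, derivedN and derivedK above reduces to a few products over the captured convolution dimensions. The following standalone sketch (hypothetical NHWC sizes, not taken from this diff) shows the mapping to implicit-GEMM dimensions, with the NCHW case swapping M and N exactly as the filterLHS logic does:

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

// Hypothetical example mirroring ImplicitGemmStrategy::initDefaultValues:
// output image dims {H=28, W=28}, output channels {F=64},
// filter loops {KH=3, KW=3}, input channels {C=32}.
int main() {
  std::vector<int64_t> outputImage = {28, 28};
  std::vector<int64_t> outputChannel = {64};
  std::vector<int64_t> filterLoop = {3, 3};
  std::vector<int64_t> inputChannel = {32};
  bool filterLHS = false; // NHWC: the image dims feed the LHS of the GEMM.

  auto product = [](const std::vector<int64_t> &v) {
    return std::accumulate(v.begin(), v.end(), int64_t{1},
                           std::multiplies<int64_t>());
  };
  int64_t m = product(outputImage);                         // 784
  int64_t n = product(outputChannel);                       // 64
  int64_t k = product(filterLoop) * product(inputChannel);  // 3*3*32 = 288
  if (filterLHS)
    std::swap(m, n); // NCHW convs put the filter on the LHS.
  std::cout << "implicit GEMM (M, N, K) = (" << m << ", " << n << ", " << k
            << ")\n";
}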
- AbstractGemmLikeStrategy::initDefaultValues(gpuModel); - - // TODO: Enable async-copies and pipelining - useAsyncCopies = false; - pipelineDepth = 0; -} - -LLVM_DUMP_METHOD void ImplicitGemmStrategy::dump() const { - print(llvm::errs()); -} - -void ImplicitGemmStrategy::print(llvm::raw_ostream &os) const { - os << "\n--- Implicit GEMM strategy ---\n"; - os << "- derived problem shape (MNK): " << m() << ", " << n() << ", " << k() - << '\n'; - os << "- convolution dim types: \n"; - llvm::interleaveComma(captures.convolutionDims.batch, os << "Batch: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.outputImage, - os << "OutputImage: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.outputChannel, - os << "OutputChannel: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.filterLoop, - os << "FilterLoop: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.inputChannel, - os << "InputChannel: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.depth, os << "Depth: "); - os << "\n"; - AbstractGemmLikeStrategy::print(os); -} - -LogicalResult ImplicitGemmStrategy::validate(const GPUModel &gpuModel) const { - // First validate the parent strategy. - if (failed(AbstractGemmLikeStrategy::validate(gpuModel))) - return failure(); - - if (batch() < blockTileBatch()) { - return emitError(UnknownLoc::get(ctx)) - << "batch( " << batch() << ") < blockTileBatch(" << blockTileBatch() - << ") this is at risk of not vectorizing and is NYI"; - } - - if (blockTileSizes.size() < 3) { - LDBG("--Not enough block tile sizes\n"); - return failure(); - } - - if (numWarps.size() < 3) { - LDBG("--Not enough num warps\n"); - return failure(); - } - - if (numThreads.size() < 3) { - LDBG("--Not enough num threads\n"); - return failure(); - } - - if (useFma) - return success(); - - // Currently unrolling is problematic without a unit batch. Fail for now. - if (blockTileBatch() != 1) { - LDBG("--Batch tile size must be 1 for tensor core strategies\n"); - return failure(); - } - - Type lhsElementType = captures.inputElementType; - Type rhsElementType = captures.filterElementType; - Type resElementType = captures.outputElementType; - if (!lhsElementType.isF32() || !rhsElementType.isF32() || - !resElementType.isF32()) { - LDBG("--Tensorcore implicit gemm strategy only supported for f32: " - << lhsElementType << ", " << rhsElementType << ", " << resElementType); - return failure(); - } - if (lhsElementType != rhsElementType) { - LDBG("--Tensorcore implicit gemm strategy mixed input types unsupported\n"); - return failure(); - } - - return success(); -} - -static std::tuple -buildConvolutionStrategyBlockDistribution( - ImplicitLocOpBuilder &b, Value variantH, - const ImplicitGemmStrategy &strategy) { - // Step 1. Call the matcher. Note that this is the same matcher as used to - // trigger this compilation path, so it must always apply. - b.create(); - auto [fillH, convolutionH, maybeTrailingH] = unpackRegisteredMatchCallback<3>( - b, "convolution", transform::FailurePropagationMode::Propagate, variantH); - - // Step 2. Do Img2Col on the convolution to get the GEMM + img2col op. - Type convType = convolutionH.getType(); - auto conv2DToImg2Col = b.create( - TypeRange{convType, convType}, convolutionH); - Value img2colH = conv2DToImg2Col.getImg2colTensor(); - Value transformedH = conv2DToImg2Col.getTransformed(); - - // The matmul is the producer of the transformed handle (expand back to - // convolution shape). 
- Value matmulH = b.create( - transformedH.getType(), transformedH, 0); - - // Bubble the expand_shape from img2col through the trailing elementwise - Value funcH = b.create(variantH, func::FuncOp::getOperationName()); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - - // Step 3. Create the block/mapping tiling level and fuse. - auto [fusionTargetH, fusionGroupH] = - buildSelectFirstNonEmpty(b, maybeTrailingH, matmulH); - MappingInfo blockMapping = strategy.getBlockMapping(); - TileToForallAndFuseAndDistributeResult tileResult = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*isolatedParentOpH=*/variantH, - /*rootH=*/fusionTargetH, - /*opsToFuseH=*/fusionGroupH, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)), - /*threadDimMapping=*/ - b.getArrayAttr(blockMapping.threadMapping)); - - // Handle the workgroup count region. - b.create( - tileResult.forallH); - - // Rematch the fill because earlier handle is invalidated. - Value newFillH = - b.create(variantH, linalg::FillOp::getOperationName()); - fillH = - b.create(newFillH, tileResult.forallH).getResult(0); - - Value tiledImg2colH = - b.create(img2colH, tileResult.forallH).getResult(0); - - auto [blockMatmulH, maybeBlockTrailingH] = buildSelectFirstNonEmpty( - b, tileResult.resultingFusedOpsHandles.front(), tileResult.tiledOpH); - - // TODO: handle trailing op. - return std::make_tuple(fillH, tiledImg2colH, blockMatmulH, - maybeBlockTrailingH, tileResult.forallH); -} - -// TODO: Merge with buildTileFuseToScfFor. -static mlir::iree_compiler::TileToScfForAndFuseResult -buildTileFuseToSingleScfFor(ImplicitLocOpBuilder &b, Value isolatedParentOpH, - Value rootH, Value opHToFuse, - ArrayRef tileSizes) { - iree_compiler::TileToScfForAndFuseResult result; - Type rootType = rootH.getType(); - auto tiletoScfForOp = b.create(rootType, rootH, tileSizes); - result.forLoops = tiletoScfForOp.getLoops(); - result.tiledOpH = tiletoScfForOp.getTiledLinalgOp(); - - assert(result.forLoops.size() == 1 && "More than one loop"); - - // TODO: Allow fusing more than one op. - b.create(opHToFuse, result.forLoops[0]); - // Avoid canonicalization for now to avoid prematurely folding away the pad - // ops. - return result; -} - -void iree_compiler::gpu::buildConvolutionImplicitGemmStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const ImplicitGemmStrategy &strategy) { - LLVM_DEBUG(strategy.print(DBGS())); - - // Step 1. Apply block-level part of the strategy, keeps everything fused. - auto [fillH, img2colH, matmulH, maybeTiledTrailingHBlock, forall] = - buildConvolutionStrategyBlockDistribution(b, variantH, strategy); - // Tile reduction loop. - SmallVector tileSizes{0, 0, 0, strategy.reductionTileSize}; - auto tileReductionResult = - buildTileFuseToSingleScfFor(b, variantH, matmulH, img2colH, tileSizes); - - // Step 2. Pad the matmul op. - auto paddedMatmulOpH = - buildPad(b, tileReductionResult.tiledOpH, - strategy.getZeroPadAttrFromElementalTypes(b).getValue(), - strategy.paddingDimensions, strategy.packingDimensions); - - // Step 3. Hoist the padding of the output operand above the reduction loop. - // The resulting fillOp will be mapped with the contraction using an SIMD - // programming model. 
- Value fillOpH; - if (!strategy.alignedRes()) { - fillOpH = buildHoistOutputPaddingOp(b, variantH, paddedMatmulOpH); - } else { - fillOpH = b.create(variantH, - linalg::FillOp::getOperationName()); - } - - Value funcH = b.create(variantH, func::FuncOp::getOperationName()); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Step 4. Distribute pad and copies: SIMT programming model. - auto [lhsCopyOpH, rhsCopyOpH, copyBackOpH] = - buildDistributeMatmulCopies(b, variantH, paddedMatmulOpH, strategy); - - // Step 5. Distribute to warps: SIMD programming model. - // TODO: get the number of warps from strategy. - MappingInfo computeMapping = strategy.computeMapping(); - buildTileFuseDistToForallWithNumThreads( - b, variantH, paddedMatmulOpH, ValueRange(), - getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), - b.getArrayAttr(computeMapping.threadMapping)); - buildTileFuseDistToForallWithNumThreads( - b, variantH, fillOpH, ValueRange(), - getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), - b.getArrayAttr(computeMapping.threadMapping)); - - // Step 6. Rank-reduce and vectorize. - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - b.create(loc); - }); - buildMatmulVectorization(b, variantH, lhsCopyOpH, rhsCopyOpH, copyBackOpH, - strategy, /*vectorizePadding=*/false, - /*vectorizeNdExtract=*/true); - - // Step 7. Bufferize and drop HAL descriptor from memref ops. - variantH = buildBufferize(b, variantH); - - // Step 8. Post-bufferization mapping to blocks and threads. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single func::FuncOp to transform, needs hardening. - // TODO: extract info from strategy. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - funcH = buildMapToBlockAndThreads(b, funcH, strategy.numThreads); - funcH = b.create(funcH); - - // Step 9. Convert to tensor core ops. - // TODO: avoid consuming handles and returning here. - funcH = buildConvertToTensorCoreOp(b, funcH, strategy); - - // TODO: Enable async copies/multibuffering/pipelining. - - // Step 10. Late lowerings and cleanups. - buildLowerVectorMasksAndCleanup(b, funcH); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h deleted file mode 100644 index e17e115abd2f..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_CONVOLUTION_IMPLICIT_GEMM_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_CONVOLUTION_IMPLICIT_GEMM_STRATEGY_H_ - -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Support/LogicalResult.h" - -namespace mlir::iree_compiler::gpu { - -struct GPUModel; - -class ImplicitGemmStrategy : public AbstractGemmLikeStrategy { -public: - ImplicitGemmStrategy( - MLIRContext *context, - const transform_ext::MatchedConvolutionCaptures &captures, - const GPUModel &gpuModel) - : AbstractGemmLikeStrategy(gpuModel), ctx(context), captures(captures) { - initDefaultValues(gpuModel); - } - - ImplicitGemmStrategy(const ImplicitGemmStrategy &) = default; - ImplicitGemmStrategy &operator=(const ImplicitGemmStrategy &) = default; - - /// Constructor quantities. - MLIRContext *ctx; - transform_ext::MatchedConvolutionCaptures captures; - - /// Initialize values from the CLI. Set cliOptionsSpecified to true if the - /// default CLI values have been overriden. - void initDefaultValues(const GPUModel &gpuModel) override; - - LogicalResult validate(const GPUModel &gpuModel) const override; - - int64_t batch() const { return captures.convolutionOpSizes[0]; } - int64_t m() const override { return derivedM; } - int64_t n() const override { return derivedN; } - int64_t k() const override { return derivedK; } - - /// Named accessors to block tile sizes associated with shapes. - int64_t blockTileBatch() const { return blockTileSizes[0]; } - int64_t blockTileM() const override { return blockTileSizes[1]; } - int64_t blockTileN() const override { return blockTileSizes[2]; } - - /// Number of threads to use. - int64_t numThreadsX() const { return numThreads[0]; } - int64_t numThreadsY() const { return numThreads[1]; } - int64_t numThreadsZ() const { return numThreads[2]; } - - /// Number of warps to use. - int64_t numWarpsX() const override { return numWarps[0]; } - int64_t numWarpsY() const override { return numWarps[1]; } - int64_t numWarpsZ() const { return numWarps[2]; } - - Type getLhsElementalType() const override { - return filterLHS ? captures.filterElementType : captures.inputElementType; - } - Type getRhsElementalType() const override { - return filterLHS ? captures.inputElementType : captures.filterElementType; - } - Type getResElementalType() const override { - return captures.outputElementType; - } - - MappingInfo getBlockMapping() const override { - // 2D named convolutions are always batched. - return MappingInfo{ - /*numThreads=*/{}, - /*tileSizes=*/{blockTileBatch(), blockTileM(), blockTileN()}, - /*threadMapping=*/{blockZ(ctx), blockY(ctx), blockX(ctx)}}; - } - - // LHS copy is of size (batch) x M x K. - MappingInfo lhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/k(), - /*copySizes=*/ - filterLHS ? 
ArrayRef{blockTileM(), reductionTileSize} - : ArrayRef{blockTileBatch(), blockTileM(), - reductionTileSize}, - /*favorPredication=*/false, - /*elementalBitWidth=*/lhsElementalBitWidth()); - } - - // RHS copy is of size (batch) x K x N. - MappingInfo rhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/n(), - /*copySizes=*/ - filterLHS ? ArrayRef{blockTileBatch(), reductionTileSize, - blockTileN()} - : ArrayRef{reductionTileSize, blockTileN()}, - /*favorPredication=*/false, - /*elementalBitWidth=*/rhsElementalBitWidth()); - } - - // RES copy is of size batch x M x N. - MappingInfo resCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/n(), - /*copySizes=*/{blockTileBatch(), blockTileM(), blockTileN()}, - /*favorPredication=*/false, - /*elementalBitWidth=*/resElementalBitWidth()); - } - - /// Check that the mapping computed for a copy is valid. - LogicalResult validateLhsCopyMapping() const override { - return validateCopyMapping(ctx, lhsCopyMapping(), "lhs"); - } - LogicalResult validateRhsCopyMapping() const override { - return validateCopyMapping(ctx, rhsCopyMapping(), "rhs"); - } - LogicalResult validateResCopyMapping() const override { - return validateCopyMapping(ctx, resCopyMapping(), "result"); - } - - // COMPUTE is of size batch x M x N. - MappingInfo computeMapping() const override { - if (useFma) { - return MappingInfo{ - /*numThreads=*/{numThreadsZ(), numThreadsY(), numThreadsX()}, - /*tileSizes=*/{}, - /*threadMapping=*/{threadZ(ctx), threadY(ctx), threadX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - return MappingInfo{/*numThreads=*/{numWarpsZ(), numWarpsY(), numWarpsX()}, - /*tileSizes=*/{}, - /*threadMapping=*/{warpZ(ctx), warpY(ctx), warpX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - - void print(llvm::raw_ostream &os) const override; - LLVM_DUMP_METHOD void dump() const override; - -private: - // For NCHW convolutions, the filter will be the LHS of the GEMM. - bool filterLHS = false; - - int64_t derivedM = 0; - int64_t derivedN = 0; - int64_t derivedK = 0; -}; - -void buildConvolutionImplicitGemmStrategy(ImplicitLocOpBuilder &b, - Value variantH, - const ImplicitGemmStrategy &strategy); - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_CONVOLUTION_IMPLICIT_GEMM_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp deleted file mode 100644 index 7d646303bec6..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h" - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-gpu-copy-mapping" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") - -int64_t iree_compiler::gpu::CopyMapping::maxContiguousElementsToTransfer( - int64_t alignment, int64_t numContiguousElements, - int64_t elementalBitWidth) { - assert(kCudaMaxVectorLoadBitWidth % elementalBitWidth == 0 && - "elemental bitwidth does not divide kCudaMaxVectorLoadBitWidth"); - return std::gcd(std::gcd(alignment, numContiguousElements), - kCudaMaxVectorLoadBitWidth / elementalBitWidth); -} - -FailureOr -iree_compiler::gpu::CopyMapping::numThreadsForCopy(int totalNumThreads, - int64_t alignment, - ArrayRef sizes, - bool favorPredication, - int64_t elementalBitWidth) { - LDBG("\nSTART numThreadsForCopy, favorPredication: " << favorPredication); - LLVM_DEBUG(llvm::interleaveComma(sizes, DBGS() << "--sizes: "); - llvm::dbgs() << "\n";); - - // Greedily find the largest vector size that can be used to copy the most - // minor dimension: we are in the business of filling 128B contiguous memory - // transactions with as few threads as possible. - int64_t maxVectorSize = CopyMapping::maxContiguousElementsToTransfer( - alignment, sizes.back(), elementalBitWidth); - LDBG("--maxVectorSize: " << maxVectorSize); - int64_t numElements = 1; - for (auto s : sizes) - numElements *= s; - LDBG("--numElements: " << numElements); - - int64_t actualVectorSize = maxVectorSize; - if (!favorPredication) { - // Bias towards reducing the vector size to avoid predication. - // Predication occurs if we end up using fewer than totalNumThreads for a - // particular copy. - // Predication chokes the current implementation of shared memory - // pipelining. - // TODO: Reevaluate this heuristic when we have a more robust pipelining - // implementation. - for (; actualVectorSize >= 1; actualVectorSize /= 2) { - LDBG("--step totalNumThreads * actualVectorSize: " - << totalNumThreads * actualVectorSize); - if (numElements % (totalNumThreads * actualVectorSize) != 0) - continue; - break; - } - LDBG("--numElements: " << numElements); - LDBG("--totalNumThreads: " << totalNumThreads); - LDBG("--actualVectorSize: " << actualVectorSize); - if (actualVectorSize == 0) { - LDBG("--Could not map copy without predication -> FAIL"); - return failure(); - } - } - - // Scale back the last size by actualVectorSize to account for the fact - // that we perform vector transfers. 
- assert(sizes.back() % actualVectorSize == 0 && - "most-minor size not divisible by actualVectorSize"); - SmallVector scaledSizes{sizes.begin(), sizes.end()}; - scaledSizes.back() /= actualVectorSize; - - int64_t numThreadsRemaining = totalNumThreads; - LDBG("--numThreadsRemaining: " << numThreadsRemaining); - SmallVector factors; - for (auto s : llvm::reverse(scaledSizes)) { - int64_t gcd = std::gcd(numThreadsRemaining, s); - factors.push_back(gcd); - numThreadsRemaining /= gcd; - LDBG("--new factors: " << gcd); - LDBG("--numThreadsRemaining: " << numThreadsRemaining); - } - - std::reverse(factors.begin(), factors.end()); - - LLVM_DEBUG(llvm::interleaveComma(factors, DBGS() << "numThreads: "); - llvm::dbgs() << "\n"; - LDBG("actualVectorSize: " << actualVectorSize);); - - return CopyMapping{actualVectorSize, factors}; -} - -iree_compiler::gpu::MappingInfo iree_compiler::gpu::CopyMapping::getMappingInfo( - MLIRContext *ctx, int totalNumThreads, int64_t alignment, - ArrayRef copySizes, bool favorPredication, - int64_t elementalBitWidth) { - assert(!copySizes.empty() && copySizes.size() <= 3 && - "only 1,2,3-D copies are supported for now"); - FailureOr maybeCopyMapping = - CopyMapping::numThreadsForCopy(totalNumThreads, alignment, copySizes, - favorPredication, elementalBitWidth); - // If failed, try again with predication; this must succeed. - if (failed(maybeCopyMapping)) { - assert(!favorPredication && - "maybe copy mapping may not fail with predication"); - maybeCopyMapping = CopyMapping::numThreadsForCopy( - totalNumThreads, alignment, copySizes, /*favorPredication=*/true, - elementalBitWidth); - } - assert(succeeded(maybeCopyMapping) && "failed to compute copy mapping"); - assert(maybeCopyMapping->numThreads.size() == copySizes.size() && - "compute copy mapping expected same number of threads and copy sizes"); - - SmallVector tileSizes = llvm::to_vector(llvm::map_range( - llvm::zip(copySizes, maybeCopyMapping->numThreads), [](auto &&pair) { - int64_t size, numThreads; - std::tie(size, numThreads) = pair; - return llvm::divideCeilSigned(size, numThreads); - })); - SmallVector allThreadMappings{linearId2(ctx), linearId1(ctx), - linearId0(ctx)}; - auto threadMapping = - llvm::to_vector(ArrayRef(allThreadMappings).take_back(tileSizes.size())); - - MappingInfo res{/*numThreads=*/maybeCopyMapping->numThreads, - /*tilecopySizes=*/tileSizes, - /*threadMapping=*/threadMapping, - /*vectorSize=*/maybeCopyMapping->vectorSize}; - LLVM_DEBUG(res.print(DBGS()); llvm::dbgs() << "\n"); - return res; -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h deleted file mode 100644 index a72f4d456088..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h +++ /dev/null @@ -1,81 +0,0 @@ - -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COPY_MAPPING_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COPY_MAPPING_H_ - -#include - -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" - -namespace mlir::iree_compiler::gpu { - -struct CopyMapping { - /// Vector size to use for the copy. 
- int64_t vectorSize; - - /// numThreads to use for the copy mapping, from most major to most minor dims - /// (i.e. numThreads.back() should be mapped to contiguous threads for best - /// coalescing). - SmallVector<int64_t> numThreads; - - /// Determine the maximal vector size to use to copy a contiguous array of - /// `numContiguousElements`, each of bitwidth `elementalBitWidth`. - /// The `alignment` is the number of elements by which the most minor - /// dimension of the copy is aligned. This is an approximation of actual - /// memory alignment after bufferization, for each row of the copy. This is - /// used to restrict the size of the copied vector so that it is properly aligned - /// with the requirements of cp.async. If the copy alignment does not match - /// the required alignment for a cp.async, the conversion to cp.async will be - /// skipped. - /// Asserts that `elementalBitWidth` divides `numContiguousElements`. - static int64_t - maxContiguousElementsToTransfer(int64_t alignment, - int64_t numContiguousElements, - int64_t elementalBitWidth = 32); - - /// Compute the number of threads to use to perform a copy of `sizes` - /// elements of `elementalBitWidth`. - /// The `alignment` is the number of elements by which the most minor - /// dimension of the copy is aligned. This is an approximation of actual - /// memory alignment after bufferization, for each row of the copy. This is - /// used to restrict the size of the copied vector so that it is properly aligned - /// with the requirements of cp.async. If the copy alignment does not match - /// the required alignment for a cp.async, the conversion to cp.async will be - /// skipped. - /// When `favorPredication` is false, the implementation avoids predication in - /// the copy, even if it means reducing the granularity of the transfer. - /// Otherwise, the implementation will come up with a best-effort predicated - /// mapping that respects the maximal vector transfer size. - static FailureOr<CopyMapping> - numThreadsForCopy(int totalNumThreads, int64_t alignment, - ArrayRef<int64_t> sizes, bool favorPredication, - int64_t elementalBitWidth = 32); - - /// Greedily compute the MappingInfo to use to perform a copy of `sizes` - /// elements of bitwidth `elementalBitWidth`. - /// The `alignment` is the number of elements by which the most minor - /// dimension of the copy is aligned. This is an approximation of actual - /// memory alignment after bufferization, for each row of the copy. This is - /// used to restrict the size of the copied vector so that it is properly aligned - /// with the requirements of cp.async. If the copy alignment does not match - /// the required alignment for a cp.async, the conversion to cp.async will be - /// skipped. When `favorPredication` is false, the mapping is computed to fill - /// all threads with an equal amount of data to copy, so as to avoid - /// predication. Predication often ends up breaking current pipelining - /// implementations down the line and is generally discouraged. At the moment, - /// asserts that sizes has exactly 2 entries.
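Taken together, the heuristics documented above (and implemented in CopyMapping.cpp earlier in this diff) do three things: pick the widest vector that divides the alignment, the row length and the 128-bit load limit, optionally shrink it so every thread gets an equal share of work (no predication), then greedily factor the available threads over the copy dimensions from most minor to most major and derive per-thread tile sizes. A standalone sketch with made-up numbers (256 threads copying a 128x64 tile of 32-bit elements aligned to 64 elements):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Standalone illustration of the CopyMapping heuristic; the driver values are
// hypothetical and the predication fallback of the real code is omitted.
constexpr int64_t kMaxVectorLoadBitWidth = 128;

static int64_t maxContiguousElements(int64_t alignment, int64_t rowElements,
                                     int64_t elementalBitWidth) {
  return std::gcd(std::gcd(alignment, rowElements),
                  kMaxVectorLoadBitWidth / elementalBitWidth);
}

int main() {
  int64_t totalNumThreads = 256, alignment = 64, bitWidth = 32;
  std::vector<int64_t> sizes = {128, 64}; // copy a 128x64 tile

  // Step 1: widest vector compatible with alignment and row length -> 4.
  int64_t vectorSize = maxContiguousElements(alignment, sizes.back(), bitWidth);

  // Step 2: shrink the vector until the copy divides evenly over all threads
  // (the real code may instead fail and retry with predication).
  int64_t numElements = sizes[0] * sizes[1]; // 8192
  while (vectorSize > 1 && numElements % (totalNumThreads * vectorSize) != 0)
    vectorSize /= 2; // stays at 4 here: 8192 % 1024 == 0

  // Step 3: greedily factor the threads over the vector-scaled sizes,
  // starting from the most minor dimension.
  std::vector<int64_t> scaled = {sizes[0], sizes[1] / vectorSize}; // {128, 16}
  std::vector<int64_t> numThreads(scaled.size());
  int64_t remaining = totalNumThreads;
  for (int64_t i = static_cast<int64_t>(scaled.size()) - 1; i >= 0; --i) {
    numThreads[i] = std::gcd(remaining, scaled[i]);
    remaining /= numThreads[i];
  } // numThreads = {16, 16}

  // Step 4: per-thread tile sizes are ceilDiv(copySize, numThreads).
  for (size_t i = 0; i < sizes.size(); ++i)
    std::cout << "dim " << i << ": numThreads=" << numThreads[i]
              << " tileSize=" << (sizes[i] + numThreads[i] - 1) / numThreads[i]
              << "\n"; // dim 0: 16 threads x tile 8, dim 1: 16 threads x tile 4
  std::cout << "vectorSize=" << vectorSize << "\n";
}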
- static MappingInfo getMappingInfo(MLIRContext *ctx, int totalNumThreads, - int64_t alignment, ArrayRef sizes, - bool favorPredication = false, - int64_t elementalBitWidth = 32); -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COPY_MAPPING_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.cpp deleted file mode 100644 index 8d1c2c3eb108..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace mlir; - -void mlir::iree_compiler::gpu::MappingInfo::print(llvm::raw_ostream &os) const { - os << "MappingInfo{"; - os << "vectorSize: " << ((vectorSize.has_value()) ? vectorSize.value() : 0); - llvm::interleaveComma(numThreads, os << ", numThreads: {"); - llvm::interleaveComma(tileSizes, os << "}, tileSizes: {"); - llvm::interleaveComma(threadMapping, os << "}, threadMapping: {"); - os << "}}"; -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h deleted file mode 100644 index 806da4234a27..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h +++ /dev/null @@ -1,29 +0,0 @@ - -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_MAPPING_INFO_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_MAPPING_INFO_H_ - -#include "mlir/IR/Attributes.h" - -namespace mlir::iree_compiler::gpu { - -/// Helper struct to hold the mapping information for a given operation. -struct MappingInfo { - SmallVector numThreads; - // Note: explicitly computing the tileSizes is only needed until masked - // vectorization properly computes the bounds automatically. - SmallVector tileSizes; - SmallVector threadMapping; - std::optional vectorSize; - void print(llvm::raw_ostream &os) const; - LLVM_DUMP_METHOD void dump() const; -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_MAPPING_INFO_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp deleted file mode 100644 index 4bb56107fd41..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp +++ /dev/null @@ -1,347 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Transform/IR/TransformAttrs.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Transform/IR/TransformTypes.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") - -// TODO: significantly better namespacing. -using iree_compiler::buildPad; -using iree_compiler::buildTileFuseDistToForallWithNumThreads; -using iree_compiler::buildTileFuseDistToForallWithTileSizes; -using iree_compiler::TileToForallAndFuseAndDistributeResult; -using iree_compiler::gpu::BatchMatmulStrategy; -using iree_compiler::gpu::buildBufferize; -using iree_compiler::gpu::buildConvertToAsyncCopies; -using iree_compiler::gpu::buildConvertToTensorCoreOp; -using iree_compiler::gpu::buildDistributeMatmulCopies; -using iree_compiler::gpu::buildHoistOutputPaddingOp; -using iree_compiler::gpu::buildMatmulVectorization; -using iree_compiler::gpu::buildMultiBuffering; -using iree_compiler::gpu::buildPipelineSharedMemoryCopies; -using iree_compiler::gpu::MappingInfo; -using iree_compiler::gpu::MatmulStrategy; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; -using iree_compiler::IREE::transform_dialect:: - PopulateWorkgroupCountRegionUsingNumThreadsSliceOp; -using transform::MatchOp; -using transform_ext::RegisterMatchCallbacksOp; - -void MatmulStrategy::initDefaultValues(const GPUModel &gpuModel) { - // Set the configuration for padding the matmul. - paddingValueTypes = {captures.lhsElementType, captures.rhsElementType, - captures.outputElementType}; - paddingDimensions = {0, 1, 2}; - packingDimensions = {1, 1, 1}; - - // Pull in tile configs from flags. - AbstractGemmLikeStrategy::initDefaultValues(gpuModel); -} - -LLVM_DUMP_METHOD void MatmulStrategy::dump() const { print(llvm::errs()); } - -void MatmulStrategy::print(llvm::raw_ostream &os) const { - os << "\n--- Matmul strategy ---\n"; - AbstractGemmLikeStrategy::print(os); -} - -LogicalResult MatmulStrategy::validate(const GPUModel &gpuModel) const { - // First validate the parent strategy. - if (failed(AbstractGemmLikeStrategy::validate(gpuModel))) - return failure(); - - // Unlike for wmma/mma, we have no special type requirements for fma. 
- if (useFma) - return success(); - - Type lhsElementType = captures.lhsElementType; - Type rhsElementType = captures.rhsElementType; - Type resElementType = captures.outputElementType; - if (!lhsElementType.isF32() || !rhsElementType.isF32() || - !resElementType.isF32()) { - LDBG("--Tensorcore matmul strategy only supported for f32: " - << lhsElementType << ", " << rhsElementType << ", " << resElementType); - return failure(); - } - if (lhsElementType != rhsElementType) { - LDBG("--Tensorcore matmul strategy mixed input types unsupported\n"); - return failure(); - } - - if (useMmaSync) { - if (!gpuModel.hasTF32TensorCore) { - LDBG("--Matmul strategy target has not TF32 tensor core\n"); - return failure(); - } - - if (!gpuModel.hasMmaSync) { - LDBG("--Matmul strategy target does not support MMA.SYNC operations\n"); - return failure(); - } - } else { - // Verify WMMA. - // Hard coded to reflect current WMMA unrolling support. - int reqM = 16; - int reqN = 16; - int reqK = lhsElementType.isF32() ? 8 : 16; - if (llvm::all_of(gpuModel.supportedWMMAConfigs, - [&](iree_compiler::gpu::MMAConfig config) { - return config.m != reqM || config.n != reqN || - config.k != reqK || - config.aType != lhsElementType || - config.bType != rhsElementType || - config.cType != resElementType; - })) { - LDBG("--Matmul strategy failed wmma type check\n"); - return failure(); - } - } - return success(); -} - -LogicalResult BatchMatmulStrategy::validate(const GPUModel &gpuModel) const { - if (failed(MatmulStrategy::validate(gpuModel))) { - return failure(); - } - - if (batch() < blockTileBatch()) { - return emitError(UnknownLoc::get(ctx)) - << "batch( " << batch() << ") < blockTileBatch(" << blockTileBatch() - << ") this is at risk of not vectorizing and is NYI"; - } - - // Only single outermost batch dimension is currently supported. - if (captures.batches().size() != 1 || captures.batches().back() != 0) { - LDBG("--Couldn't find single outermost batch dimension\n"); - return failure(); - } - - if (blockTileSizes.size() < 3) { - LDBG("--Not enough block tile sizes\n"); - return failure(); - } - - if (numWarps.size() < 3) { - LDBG("--Not enough num warps\n"); - return failure(); - } - - if (numThreads.size() < 3) { - LDBG("--Not enough num threads\n"); - return failure(); - } - - if (!useFma) { - LDBG("--Only FMA is supported for batch matmul atm\n"); - return failure(); - } - - return success(); -} - -static std::tuple -buildMatmulStrategyBlockDistribution(ImplicitLocOpBuilder &b, Value variantH, - const MatmulStrategy &strategy) { - // Step 1. Call the matcher. Note that this is the same matcher as used to - // trigger this compilation path, so it must always apply. - b.create(); - auto [fillH, matmulH, maybeTrailingH] = unpackRegisteredMatchCallback<3>( - b, "matmul", transform::FailurePropagationMode::Propagate, variantH); - - // Step 2. Create the block/mapping tiling level and fusee. - // auto [fusionTargetH, fusionGroupH] = - // buildSelectFirstNonEmpty(b, maybeTrailingH, matmulH); - MappingInfo blockMapping = strategy.getBlockMapping(); - TileToForallAndFuseAndDistributeResult tileResult = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/matmulH, - /*opsToFuseH=*/fillH, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)), - /*threadDimMapping=*/ - b.getArrayAttr(blockMapping.threadMapping)); - - // Handle the workgroup count region. - b.create( - tileResult.forallH); - - // TODO: handle trailing op. 
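// [Illustrative aside, not part of the deleted sources] A standalone
// restatement of the WMMA capability check in MatmulStrategy::validate above:
// the strategy is rejected when no supported config matches the required
// 16x16x8 (f32) or 16x16x16 shape with matching element types. Element types
// are reduced to an enum for brevity; this is a sketch, not the IREE MMAConfig.
#include <algorithm>
#include <vector>

enum class Elt { F16, F32 };

struct MMAConfigSketch {
  int m, n, k;
  Elt aType, bType, cType;
};

bool supportsRequiredWMMA(const std::vector<MMAConfigSketch> &supportedConfigs,
                          Elt lhs, Elt rhs, Elt res) {
  int reqM = 16, reqN = 16;
  int reqK = (lhs == Elt::F32) ? 8 : 16;  // mirrors the hard-coded WMMA unrolling support
  return std::any_of(supportedConfigs.begin(), supportedConfigs.end(),
                     [&](const MMAConfigSketch &c) {
                       return c.m == reqM && c.n == reqN && c.k == reqK &&
                              c.aType == lhs && c.bType == rhs && c.cType == res;
                     });
}
// e.g. supportsRequiredWMMA({{16, 16, 8, Elt::F32, Elt::F32, Elt::F32}},
//                           Elt::F32, Elt::F32, Elt::F32) == true.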
- return std::make_tuple(tileResult.resultingFusedOpsHandles.front(), - tileResult.tiledOpH, Value(), tileResult.forallH); -} - -/// Builds the common part of the schedule for matmuls and batched matmuls. -static void -buildCommonMatmulLikeThreadSchedule(ImplicitLocOpBuilder &b, Value variantH, - Value fillH, Value matmulH, - const MatmulStrategy &strategy) { - using mlir::iree_compiler::buildLowerVectorMasksAndCleanup; - using mlir::iree_compiler::buildTileFuseToScfFor; - using namespace mlir::iree_compiler::gpu; - - // Tile the reduction loop (last in the list). - SmallVector tileSizes(strategy.captures.matmulOpSizes.size() - 1, 0); - tileSizes.push_back(strategy.reductionTileSize); - - // Avoid canonicalizing before the pad to avoid folding away the extract_slice - // on the output needed to hoist the output pad. - auto tileReductionResult = buildTileFuseToScfFor( - b, variantH, matmulH, {}, getAsOpFoldResult(b.getI64ArrayAttr(tileSizes)), - /*canonicalize=*/false); - - // Step 2. Pad the (batch) matmul op. - auto paddedMatmulOpH = - buildPad(b, tileReductionResult.tiledOpH, - strategy.getZeroPadAttrFromElementalTypes(b).getValue(), - strategy.paddingDimensions, strategy.packingDimensions); - - // Step 3. Hoist the padding of the output operand above the reduction loop. - // The resulting fillOp will be mapped with the contraction using an SIMD - // programming model. - Value fillOpH = fillH; - if (!strategy.alignedRes()) { - fillOpH = buildHoistOutputPaddingOp(b, variantH, paddedMatmulOpH); - } - - // Running canonicalization is required here to enable aligned pads to become - // linalg.copy ops when rewriting in DPS. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Step 4. Distribute pad and copies: SIMT programming model. - auto [lhsCopyOpH, rhsCopyOpH, copyBackOpH] = - buildDistributeMatmulCopies(b, variantH, paddedMatmulOpH, strategy); - - // Step 5. Distribute to warps: SIMD programming model. - // TODO: get the number of warps from strategy. - MappingInfo computeMapping = strategy.computeMapping(); - buildTileFuseDistToForallWithNumThreads( - b, variantH, paddedMatmulOpH, ValueRange(), - getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), - b.getArrayAttr(computeMapping.threadMapping)); - buildTileFuseDistToForallWithNumThreads( - b, variantH, fillOpH, ValueRange(), - getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), - b.getArrayAttr(computeMapping.threadMapping)); - - // Step 6. Rank-reduce and vectorize. - buildMatmulVectorization(b, variantH, lhsCopyOpH, rhsCopyOpH, copyBackOpH, - strategy); - - // Step 7. Bufferize and drop HAL descriptor from memref ops. - variantH = buildBufferize(b, variantH); - - // Step 8. Post-bufferization mapping to blocks and threads. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single func::FuncOp to transform, needs hardening. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - funcH = - buildMapToBlockAndThreads(b, funcH, - /*blockSize=*/strategy.numThreads, - /*subgroupSize=*/strategy.targetSubgroupSize); - funcH = b.create(funcH); - - // Step 9. Convert to tensor core ops. - // TODO: avoid consuming handles and returning here. - funcH = buildConvertToTensorCoreOp(b, funcH, strategy); - - // TODO: Support pipelining strategies without async copy (e.g. store to - // shared memory in stage 0). - if (strategy.useAsyncCopies) { - // Step 10. Multi-buffering. 
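// [Illustrative aside, not part of the deleted sources] The reduction tiling at
// the top of buildCommonMatmulLikeThreadSchedule above tiles only the trailing
// reduction loop: every parallel dimension gets tile size 0 ("do not tile") and
// the last entry is the reduction tile size. A plain-vector sketch:
#include <cstdint>
#include <vector>

std::vector<int64_t> reductionOnlyTileSizes(size_t numLoops, int64_t reductionTileSize) {
  std::vector<int64_t> tileSizes(numLoops - 1, 0);  // leave parallel dims untiled
  tileSizes.push_back(reductionTileSize);           // tile the trailing reduction dim
  return tileSizes;
}
// e.g. for an (M, N, K) matmul with reductionTileSize = 16: {0, 0, 16}.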
- if (strategy.pipelineDepth > 1) - buildMultiBuffering(b, funcH, strategy); - - // Step 11. Convert to async copies. - // TODO: avoid consuming handles and returning here. - funcH = buildConvertToAsyncCopies(b, funcH, strategy); - - // Step 12. Pipeline shared memory copies. - if (strategy.pipelineDepth > 1) - buildPipelineSharedMemoryCopies(b, funcH, strategy); - } - - // Step 13. Late lowerings and cleanups. - buildLowerVectorMasksAndCleanup(b, funcH); -} - -void iree_compiler::gpu::buildMatmulTensorCoreStrategy( - ImplicitLocOpBuilder &b, Value variantH, const MatmulStrategy &strategy) { - LLVM_DEBUG(strategy.print(DBGS())); - - // Step 1. Apply block-level part of the strategy, keeps everything fused. - auto [fillH, matmulH, maybeTiledTrailingHBlock, forall] = - buildMatmulStrategyBlockDistribution(b, variantH, strategy); - buildCommonMatmulLikeThreadSchedule(b, variantH, fillH, matmulH, strategy); -} - -/// Builds the transform dialect operations distributing batch matmul across -/// blocks according to the given strategy. -static std::tuple -buildBatchMatmulStrategyBlockDistribution(ImplicitLocOpBuilder &b, - Value variantH, - const BatchMatmulStrategy &strategy) { - b.create(); - auto [fillH, bmmH] = unpackRegisteredMatchCallback<2>( - b, "batch_matmul", transform::FailurePropagationMode::Propagate, - variantH); - - MappingInfo blockMapping = strategy.getBlockMapping(); - TileToForallAndFuseAndDistributeResult tileResult = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/bmmH, - /*opsToFuseH=*/fillH, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)), - /*threadDimMapping=*/ - b.getArrayAttr(blockMapping.threadMapping)); - - // Handle the workgroup count region. - b.create( - tileResult.forallH); - return std::make_tuple(tileResult.resultingFusedOpsHandles.front(), - tileResult.tiledOpH, tileResult.forallH); -} - -void iree_compiler::gpu::buildBatchMatmulStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const BatchMatmulStrategy &strategy) { - LLVM_DEBUG(strategy.print(DBGS())); - - auto [fillH, matmulH, forallH] = - buildBatchMatmulStrategyBlockDistribution(b, variantH, strategy); - buildCommonMatmulLikeThreadSchedule(b, variantH, fillH, matmulH, strategy); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h deleted file mode 100644 index 99c41d05b940..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h +++ /dev/null @@ -1,276 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_TENSOR_CORE_MATMUL_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_TENSOR_CORE_MATMUL_STRATEGY_H_ - -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Support/LogicalResult.h" - -namespace mlir::iree_compiler::gpu { - -struct GPUModel; - -class MatmulStrategy : public AbstractGemmLikeStrategy { -public: - MatmulStrategy(MLIRContext *context, - const transform_ext::MatchedMatmulCaptures &captures, - const GPUModel &gpuModel) - : AbstractGemmLikeStrategy(gpuModel), ctx(context), captures(captures) { - initDefaultValues(gpuModel); - } - - MatmulStrategy(const MatmulStrategy &) = default; - MatmulStrategy &operator=(const MatmulStrategy &) = default; - - /// Constructor quantities. - MLIRContext *ctx; - transform_ext::MatchedMatmulCaptures captures; - - /// Initialize values from the CLI. Set cliOptionsSpecified to true if the - /// default CLI values have been overriden. - void initDefaultValues(const GPUModel &gpuModel) override; - - LogicalResult validate(const GPUModel &gpuModel) const override; - - int64_t m() const override { - assert(captures.matmulOpSizes.size() == 3 && "need 3 sizes"); - return captures.matmulOpSizes[0]; - } - int64_t n() const override { - assert(captures.matmulOpSizes.size() == 3 && "need 3 sizes"); - return captures.matmulOpSizes[1]; - } - int64_t k() const override { - assert(captures.matmulOpSizes.size() == 3 && "need 3 sizes"); - return captures.matmulOpSizes[2]; - } - - int64_t blockTileM() const override { - assert(blockTileSizes.size() >= 2 && "need at least 2 tile sizes"); - return blockTileSizes[0]; - } - int64_t blockTileN() const override { - assert(blockTileSizes.size() >= 2 && "need at least 2 tile sizes"); - return blockTileSizes[1]; - } - - int64_t numWarpsX() const override { - assert(numWarps.size() >= 2 && "need at least 2 warp sizes"); - return numWarps[0]; - } - int64_t numWarpsY() const override { - assert(numWarps.size() >= 2 && "need at least 2 warp sizes"); - return numWarps[1]; - } - - Type getLhsElementalType() const override { return captures.lhsElementType; } - Type getRhsElementalType() const override { return captures.rhsElementType; } - Type getResElementalType() const override { - return captures.outputElementType; - } - - MappingInfo getBlockMapping() const override { - return MappingInfo{/*numThreads=*/{}, - /*tileSizes=*/{blockTileM(), blockTileN()}, - /*threadMapping=*/{blockY(ctx), blockX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - - // LHS copy is of size mxk. - MappingInfo lhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/k(), - /*copySizes=*/ArrayRef{blockTileM(), reductionTileSize}, - /*favorPredication=*/false, - /*elementalBitWidth=*/lhsElementalBitWidth()); - } - LogicalResult validateLhsCopyMapping() const override { - MappingInfo mapping = lhsCopyMapping(); - // It is fine to use fewer threads to copy the LHS. 
- if (totalNumThreads() < mapping.numThreads[0] * mapping.numThreads[1]) { - llvm::errs() << "too many threads used for transferring lhs: " - << mapping.numThreads[0] << " * " << mapping.numThreads[1] - << " > " << totalNumThreads() << "\n"; - return failure(); - } - return success(); - } - - // RHS copy is of size kxn. - MappingInfo rhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/n(), - /*copySizes=*/ArrayRef{reductionTileSize, blockTileN()}, - /*favorPredication=*/false, - /*elementalBitWidth=*/rhsElementalBitWidth()); - } - LogicalResult validateRhsCopyMapping() const override { - MappingInfo mapping = rhsCopyMapping(); - // It is fine to use fewer threads to copy the RHS. - if (totalNumThreads() < mapping.numThreads[0] * mapping.numThreads[1]) { - llvm::errs() << "too many threads used for transferring rhs: " - << mapping.numThreads[0] << " * " << mapping.numThreads[1] - << " > " << totalNumThreads() << "\n"; - return failure(); - } - return success(); - } - - // RES copy is of size mxn. - MappingInfo resCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/n(), - /*copySizes=*/ArrayRef{blockTileM(), blockTileN()}, - /*favorPredication=*/false, - /*elementalBitWidth=*/resElementalBitWidth()); - } - - LogicalResult validateResCopyMapping() const override { - MappingInfo mapping = resCopyMapping(); - // It is fine to use fewer threads to copy the RES. - if (totalNumThreads() < mapping.numThreads[0] * mapping.numThreads[1]) { - llvm::errs() << "too many threads used for transferring res: " - << mapping.numThreads[0] << " * " << mapping.numThreads[1] - << " > " << totalNumThreads() << "\n"; - return failure(); - } - return success(); - } - - // COMPUTE is of size mxn. - MappingInfo computeMapping() const override { - if (useFma) { - // When using FMA we don't need to map to warps, instead just match what - // the copy does. - return CopyMapping::getMappingInfo(ctx, totalNumThreads(), - /*alignment=*/n(), - {blockTileM(), blockTileN()}); - } - return MappingInfo{/*numThreads=*/{numWarpsY(), numWarpsX()}, - /*tileSizes=*/{}, - /*threadMapping=*/{warpY(ctx), warpX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - - void print(llvm::raw_ostream &os) const override; - LLVM_DUMP_METHOD void dump() const override; -}; - -/// An extension of the matmul strategy to batched matrix multiplications. -class BatchMatmulStrategy : public MatmulStrategy { -public: - /// Construct the default strategy, pulling options from the command-line - /// arguments if provided and using the defaults otherwise. - BatchMatmulStrategy(MLIRContext *context, const GPUModel &gpuModel, - const transform_ext::MatchedMatmulCaptures &captures) - : MatmulStrategy(context, captures, gpuModel) { - initDefaultValues(gpuModel); - } - - /// Initialize the default values of the strategy. - void initDefaultValues(const GPUModel &gpuModel) override { - // First, initialize as if this was a simple matmul. - MatmulStrategy::initDefaultValues(gpuModel); - - // Make sure we pad along all dimensions. - paddingDimensions = {0, 1, 2, 3}; - packingDimensions = {1, 1, 1, 1}; - } - - /// Check that the strategy is valid for the captures and the model. - LogicalResult validate(const GPUModel &gpuModel) const override; - - /// Named accessors to shapes. 
- int64_t batch() const { return captures.matmulOpSizes[0]; } - int64_t m() const override { return captures.matmulOpSizes[1]; } - int64_t n() const override { return captures.matmulOpSizes[2]; } - int64_t k() const override { return captures.matmulOpSizes[3]; } - - /// Named accessors to block tile sizes associated with shapes. - int64_t blockTileBatch() const { return blockTileSizes[0]; } - int64_t blockTileM() const override { return blockTileSizes[1]; } - int64_t blockTileN() const override { return blockTileSizes[2]; } - - /// Number of threads to use. - int64_t numThreadsX() const { return numThreads[0]; } - int64_t numThreadsY() const { return numThreads[1]; } - int64_t numThreadsZ() const { return numThreads[2]; } - - /// Number of warps to use. - int64_t numWarpsX() const override { return numWarps[0]; } - int64_t numWarpsY() const override { return numWarps[1]; } - int64_t numWarpsZ() const { return numWarps[2]; } - - MappingInfo getBlockMapping() const override { - return MappingInfo{ - /*numThreads=*/ - {}, - /*tileSizes=*/{blockTileBatch(), blockTileM(), blockTileN()}, - /*threadMapping=*/{blockZ(ctx), blockY(ctx), blockX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - - // LHS copy is batch x M x K. - MappingInfo lhsCopyMapping() const override { - // TODO: generalize to transpositions, here and below. - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), k(), - {blockTileBatch(), blockTileM(), reductionTileSize}, - /*favorPredication=*/false, - captures.lhsElementType.getIntOrFloatBitWidth()); - } - - // RHS copy is batch x K x N. - MappingInfo rhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), n(), - {blockTileBatch(), reductionTileSize, blockTileN()}, - /*favorPredication=*/false, - captures.rhsElementType.getIntOrFloatBitWidth()); - } - - // RES copy is batch x M x N. - MappingInfo resCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), n(), - {blockTileBatch(), blockTileM(), blockTileN()}, - /*favorPredication=*/false, - captures.outputElementType.getIntOrFloatBitWidth()); - } - - /// Check that the mapping computed for a copy is valid. - LogicalResult validateLhsCopyMapping() const override { - return validateCopyMapping(ctx, lhsCopyMapping(), "lhs"); - } - LogicalResult validateRhsCopyMapping() const override { - return validateCopyMapping(ctx, rhsCopyMapping(), "rhs"); - } - LogicalResult validateResCopyMapping() const override { - return validateCopyMapping(ctx, resCopyMapping(), "result"); - } - - // Compute is of the size batch x M x N. - MappingInfo computeMapping() const override { - assert(useFma && "only fma is currently supported"); - return MappingInfo{{numThreadsZ(), numThreadsY(), numThreadsX()}, - {}, - {threadZ(ctx), threadY(ctx), threadX(ctx)}, - std::nullopt}; - } -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_TENSOR_CORE_MATMUL_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.cpp deleted file mode 100644 index a9f6ed34d8a1..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
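// [Illustrative aside, not part of the deleted sources] Each of the
// validate*CopyMapping overloads above (and the shared validateCopyMapping used
// by the batch variant) boils down to one budget check: the product of the
// per-dimension thread counts chosen for a copy must not exceed the threads in
// the block, while using fewer is explicitly fine. A rank-agnostic sketch:
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

bool validateCopyThreadBudget(int64_t totalNumThreads,
                              const std::vector<int64_t> &copyNumThreads,
                              const char *operandName) {
  int64_t used = std::accumulate(copyNumThreads.begin(), copyNumThreads.end(),
                                 int64_t{1}, std::multiplies<int64_t>());
  if (used > totalNumThreads) {
    std::cerr << "too many threads used for transferring " << operandName
              << ": " << used << " > " << totalNumThreads << "\n";
    return false;
  }
  return true;  // using fewer threads than available is fine
}
// e.g. a 3-D batch-matmul copy mapped to {1, 8, 32} threads fits a 256-thread block.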
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Transform/IR/TransformTypes.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. -using iree_compiler::blockX; -using iree_compiler::blockY; -using iree_compiler::blockZ; -using iree_compiler::buildPad; -using iree_compiler::TileToForallAndFuseAndDistributeResult; -using iree_compiler::gpu::buildBufferize; -using iree_compiler::gpu::buildConvertToAsyncCopies; -using iree_compiler::gpu::buildDistributeOnePadOrCopyWithNumThreads; -using iree_compiler::gpu::buildDistributeOnePadOrCopyWithTileSizes; -using iree_compiler::gpu::PadStrategy; -using iree_compiler::IREE::transform_dialect:: - PopulateWorkgroupCountRegionUsingNumThreadsSliceOp; -using transform::MatchOp; -using transform_ext::RegisterMatchCallbacksOp; - -static llvm::cl::list clBlockTileSizes( - "td-pad-strategy-blk-sizes", - llvm::cl::desc("block tile sizes for dims (x,y,z) for the transform " - "dialect pad strategy"), - llvm::cl::list_init(ArrayRef{64, 64, 1}), - llvm::cl::CommaSeparated); -static llvm::cl::list clNumThreads( - "td-pad-strategy-num-threads", - llvm::cl::desc("number of threads for dims (x,y,z) for the transform " - "dialect pad strategy"), - llvm::cl::list_init(ArrayRef{16, 16, 1}), - llvm::cl::CommaSeparated); -static llvm::cl::list clVectorSize( - "td-pad-strategy-vector-size", - llvm::cl::desc("vector size for the transform dialect pad strategy"), - llvm::cl::list_init(ArrayRef{4, 4}), llvm::cl::CommaSeparated); -static llvm::cl::opt clUseAsyncCopies( - "td-pad-strategy-use-async-copies", - llvm::cl::desc( - "use async copies through shared memory for the pad strategy"), - llvm::cl::init(false)); - -void iree_compiler::gpu::PadStrategy::initDefaultValues() { - blockTileSizes = - SmallVector{clBlockTileSizes.begin(), clBlockTileSizes.end()}; - numThreads = SmallVector{clNumThreads.begin(), clNumThreads.end()}; - vectorSize = SmallVector{clVectorSize.begin(), clVectorSize.end()}; - useAsyncCopies = clUseAsyncCopies; -} - -void iree_compiler::gpu::PadStrategy::configure(GPUModel gpuModel) {} - -static std::tuple -buildPadStrategyBlockDistribution(ImplicitLocOpBuilder &b, Value variantH, - const PadStrategy &strategy) { - // Step 1. Call the matcher. Note that this is the same matcher as used to - // trigger this compilation path, so it must always apply. 
- b.create(); - auto [padH] = unpackRegisteredMatchCallback<1>( - b, "pad", transform::FailurePropagationMode::Propagate, variantH); - - // Step 2. Create the block/mapping tiling level. - MLIRContext *ctx = b.getContext(); - auto [tiledPadH, forallH] = buildDistributeOnePadOrCopyWithTileSizes( - b, variantH, padH, - /*tileSizes=*/{strategy.blockTileSizeY(), strategy.blockTileSizeX()}, - /*threadDimMapping=*/{blockY(ctx), blockX(ctx)}, /*foldIfBranch=*/true); - - // Step 3.Handle the workgroup count region. - b.create(forallH); - return std::make_tuple(tiledPadH, forallH); -} - -void iree_compiler::gpu::buildPadStrategy(ImplicitLocOpBuilder &b, - Value variantH, - const PadStrategy &strategy) { - MLIRContext *ctx = b.getContext(); - // Step 1. Apply block-level part of the strategy. - auto [padBlockH, forallBlockH] = - buildPadStrategyBlockDistribution(b, variantH, strategy); - - // Step 2. Apply thread-level part of the strategy. - auto padThreadH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, padBlockH, - /*numThreads=*/{strategy.numThreadsY(), strategy.numThreadsX()}, - /*threadDimMapping=*/{threadY(ctx), threadX(ctx)}, /*foldIfBranch=*/true); - - // Step 3. Masked vectorization. - SmallVector scalableSizes(strategy.vectorSize.size(), false); - b.create(padThreadH, ValueRange(), - strategy.vectorSize, nullptr, scalableSizes); - - // Step 4. Lower all masked vector transfers at this point, as they make - // canonicalization generate incorrect IR. - // TODO: don't rematch, apply on the variant op directly. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - buildLowerMaskedTransfersAndCleanup(b, funcH); - - // Step 5. Vectorize the rest of func normally. - funcH = buildVectorize(b, funcH, /*applyCleanups=*/true); - - // Step 6. Bufferize and drop HAL descriptor from memref ops. - variantH = buildBufferize(b, variantH); - - // Step 7. Post-bufferization mapping to blocks and threads. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single func::FuncOp to transform, needs hardening. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - funcH = buildMapToBlockAndThreads( - b, funcH, - /*blockSize=*/ - {strategy.numThreadsX(), strategy.numThreadsY(), strategy.numThreadsZ()}); - - // TODO: Multi-buffering and async copies in cases where HW supports it. - assert(!strategy.useAsyncCopies && "not implemented yet"); - - // Step 8. Lower masks before returning to the default lowering pipeline. - buildLowerVectorMasksAndCleanup(b, funcH); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h deleted file mode 100644 index 45aa80837676..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
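// [Illustrative aside, not part of the deleted sources] With the default flags
// above (64x64 block tile, 16x16 threads, 4x4 vector size), the thread-level
// distribution in buildPadStrategy gives each thread exactly one 4x4 vector
// chunk of the padded tile. A small standalone check of that relationship;
// perThreadTile is an illustrative helper, not a deleted IREE function:
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> perThreadTile(const std::vector<int64_t> &blockTile,
                                   const std::vector<int64_t> &numThreads) {
  std::vector<int64_t> result;
  for (size_t i = 0; i < numThreads.size(); ++i)
    result.push_back(blockTile[i] / numThreads[i]);  // assumes even divisibility
  return result;
}

int main() {
  std::vector<int64_t> tile = perThreadTile({64, 64}, {16, 16});
  assert(tile == std::vector<int64_t>({4, 4}));  // matches td-pad-strategy-vector-size
}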
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_PAD_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_PAD_STRATEGY_H_ - -#include - -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" - -namespace mlir::iree_compiler::gpu { - -struct PadConfig {}; - -/// Simple padding strategy. -class PadStrategy : public GPUStrategy { -public: - PadStrategy(MLIRContext *context, - const transform_ext::MatchedPadCaptures &captures, - const PadConfig &config, const GPUModel &gpuModel) - : GPUStrategy(gpuModel), ctx(context), captures(captures) { - initDefaultValues(); - (void)config; - } - - PadStrategy(const PadStrategy &) = default; - PadStrategy &operator=(const PadStrategy &) = default; - - void initDefaultValues(); - void configure(GPUModel gpuModel); - - int64_t blockTileSizeX() const { return blockTileSizes[0]; } - int64_t blockTileSizeY() const { return blockTileSizes[1]; } - int64_t blockTileSizeZ() const { return blockTileSizes[2]; } - int64_t numThreadsX() const { return numThreads[0]; } - int64_t numThreadsY() const { return numThreads[1]; } - int64_t numThreadsZ() const { return numThreads[2]; } - - /// Constructor quantities. - MLIRContext *ctx; - transform_ext::MatchedPadCaptures captures; - - /// Tile sizes for the workgroup / determines grid size for all known - /// reduction strategies. - SmallVector blockTileSizes; - SmallVector numThreads; - SmallVector vectorSize; - // TODO: implement this case. - bool useAsyncCopies = false; -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_PAD_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.cpp deleted file mode 100644 index 071ab9715eef..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.cpp +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. 
-using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using iree_compiler::IREE::transform_dialect::VectorToWarpExecuteOnLane0Op; -using iree_compiler::IREE::transform_dialect::VectorWarpDistributionOp; -using transform::FuseIntoContainingOp; -using transform::MatchOp; -using transform::ScalarizeOp; -using transform::SequenceOp; -using transform_ext::MatchCallbackOp; -using transform_ext::RegisterMatchCallbacksOp; -using transform_ext::StructuredOpMatcher; - -using iree_compiler::AbstractReductionStrategy; -using iree_compiler::gpu::adjustNumberOfWarpsForBlockShuffle; -using iree_compiler::gpu::build1DSplittingStrategyWithOptionalThreadMapping; -using iree_compiler::gpu::buildCommonTrailingStrategy; -using iree_compiler::gpu::buildDistributeVectors; -using iree_compiler::gpu::kCudaMaxVectorLoadBitWidth; -using iree_compiler::gpu::ReductionConfig; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::gpu::SmallReductionStrategy; -using iree_compiler::gpu::threadX; -using iree_compiler::gpu::threadY; -using iree_compiler::gpu::threadZ; - -mlir::iree_compiler::gpu::SmallReductionStrategy::SmallReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig, const GPUModel &gpuModel) - : AbstractReductionStrategy(captures, {}), GPUStrategy(gpuModel) { - configure(reductionConfig); - LLVM_DEBUG(DBGS() << "use GPU small reduction strategy\n"); - LLVM_DEBUG(llvm::interleaveComma(workgroupTileSizes, - DBGS() << "--workgroupTileSizes: "); - llvm::dbgs() << "\n"); -} - -void mlir::iree_compiler::gpu::SmallReductionStrategy::configure( - const ReductionConfig &reductionConfig) { - int64_t maxNumThreadsToUse = reductionConfig.maxNumThreads; - assert(maxNumThreadsToUse > 0 && "maxNumThreadsToUse must be > 0"); - assert(maxNumThreadsToUse >= subgroupSize && "not even a warp?"); - - // Block-level - // =========== - // TODO: capture more dims than just the most minor parallel and have a more - // powerful `maybeDivisor` evaluation. - int64_t mostMinorParallelDimensionSize = - ArrayRef(captures.reductionOpSizes).drop_back().back(); - FailureOr maybeDivisor = maxDivisorOfValueBelowLimit( - mostMinorParallelDimensionSize, maxNumThreadsToUse); - - // Trailing elementwise unaligned tiling created bounded local buffers that - // are dynamic. Attempting to bound them in Common/PadDynamicAlloc.cpp results - // in a crash in the associated upstream util. - // TODO: More generally fix PadDynamicAlloc and the associated upstream util. - bool hasTrailingElementwise = (captures.maybeTrailingRank > 0); - if (failed(maybeDivisor) && hasTrailingElementwise) - maybeDivisor = 1; - - // If the captured dimension has no satisfactory divisor, just tile the last - // parallel dimension by 2 * subgroupSize. - int64_t numParallelLoops = captures.reductionRank - 1; - workgroupTileSizes.append(numParallelLoops, 1); - workgroupTileSizes.back() = - hasTrailingElementwise - ? *maybeDivisor - : std::min((int64_t)maxNumThreadsToUse, (int64_t)(2 * subgroupSize)); - - // Thread-level - // ============ - // Just running sequentially on each thread and relying on cache for - // locality. 
-} - -static void buildSmallReductionStrategyThreadDistribution( - ImplicitLocOpBuilder &b, Value variantH, Value maybeLeadingH, Value fillH, - Value reductionH, Value maybeTrailingH, - const SmallReductionStrategy &strategy) { - auto [fusionTargetH, fusionGroupH] = - iree_compiler::buildSelectFirstNonEmpty(b, maybeTrailingH, reductionH); - MLIRContext *ctx = b.getContext(); - SmallVector threadDimMapping{threadX(ctx), threadY(ctx), - threadZ(ctx)}; - threadDimMapping.resize(strategy.workgroupTileSizes.size()); - iree_compiler::TileToForallAndFuseAndDistributeResult tileResult = - iree_compiler::buildTileFuseDistToForallWithNumThreads( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/fusionTargetH, - /*opsToFuseH=*/fusionGroupH, - /*numThreads=*/ - getAsOpFoldResult(b.getI64ArrayAttr(strategy.workgroupTileSizes)), - /*threadDimMapping=*/b.getArrayAttr(threadDimMapping)); - fillH = - b.create(fillH, tileResult.forallH).getFusedOp(); - maybeLeadingH = - b.create(maybeLeadingH, tileResult.forallH) - .getFusedOp(); - - // 1. Scalarize all ops to ensure vectorization. - auto anyOpType = transform::AnyOpType::get(b.getContext()); - fillH = b.create(anyOpType, fillH); - maybeLeadingH = b.create(anyOpType, maybeLeadingH); - Value tiledH = b.create(anyOpType, tileResult.tiledOpH); - Value fusedH = b.create( - anyOpType, tileResult.resultingFusedOpsHandles.front()); - auto [blockReductionH, maybeBlockTrailingH] = - iree_compiler::buildSelectFirstNonEmpty(b, fusedH, tiledH); - - // 2. Apply the 1d splitting strategy to the reduction part while specifying - // a single thread. This triggers the splitting but not the thread mapping - // part. - build1DSplittingStrategyWithOptionalThreadMapping( - /*b=*/b, - /*variantH=*/variantH, - /*opH=*/blockReductionH, - /*rank=*/strategy.captures.reductionRank, - // TODO: capture and generalize mostMinorDim. - /*mostMinorDim=*/strategy.captures.reductionRank - 1, - /*opSizes=*/strategy.captures.reductionOpSizes, - /*numThreads=*/1); - - // 3. Apply the 1d splitting strategy to the trailing elementwise part while - // specifying a single thread. This triggers the splitting but not the thread - // mapping part. - build1DSplittingStrategyWithOptionalThreadMapping( - /*b=*/b, - /*variantH=*/variantH, - /*opH=*/maybeBlockTrailingH, - /*rank=*/strategy.captures.maybeTrailingRank, - // TODO: capture and generalize mostMinorDim. - /*mostMinorDim=*/strategy.captures.maybeTrailingRank - 1, - /*opSizes=*/strategy.captures.trailingOpSizes, - /*numThreads=*/1); -} - -void mlir::iree_compiler::gpu::buildSmallReductionStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const SmallReductionStrategy &strategy) { - // Step 1. Apply block-level part of the strategy, keeps everything fused. - ArrayRef workgroupTileSizes{strategy.workgroupTileSizes}; - auto [maybeLeadingHBlock, gridFillH, gridReductionH, maybeTiledTrailingHBlock, - forall] = - buildReductionStrategyBlockDistribution( - b, variantH, - workgroupTileSizes.take_front(strategy.captures.reductionRank - 1)); - - // Step 2. Apply thread-level part of the strategy, keeps everything fused. - buildSmallReductionStrategyThreadDistribution( - b, variantH, maybeLeadingHBlock, gridFillH, gridReductionH, - maybeTiledTrailingHBlock, strategy); - - // Step 3-4. Common trailing steps. 
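// [Illustrative aside, not part of the deleted sources] The block-level tile
// size chosen in SmallReductionStrategy::configure above hinges on one helper:
// find the largest divisor of the most-minor parallel dimension that stays
// within the thread budget, falling back to 2 * subgroupSize-style tiling when
// no such divisor exists. A plain stand-in for maxDivisorOfValueBelowLimit
// (assumed semantics: largest divisor of `value` that is <= `limit`, failing on
// the trivial divisor 1):
#include <algorithm>
#include <cstdint>
#include <optional>

std::optional<int64_t> maxDivisorBelowLimit(int64_t value, int64_t limit) {
  for (int64_t d = std::min(value, limit); d > 1; --d)
    if (value % d == 0)
      return d;
  return std::nullopt;  // only the trivial divisor 1 fits under the limit
}
// e.g. maxDivisorBelowLimit(96, 128) == 96; maxDivisorBelowLimit(97, 128) == std::nullopt.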
- buildCommonTrailingStrategy(b, variantH, strategy.getNumThreadsInBlock()); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h deleted file mode 100644 index 79a76db6b0dd..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_SMALL_REDUCTION_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_SMALL_REDUCTION_STRATEGY_H_ - -#include - -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" - -namespace mlir::iree_compiler::gpu { - -/// Encode a strategy targeted at (very) small reductions, for which other -/// strategies perform poorly. -/// -/// In the case of small reductions, we cannot make an efficient use of warp -/// shuffles. Instead, take advantage of caches. -/// This strategy aims at running the reduction sequentially within each -/// thread and taking parallelism from outer dimensions that we would -/// otherwise use for block-level parallelism. -/// -/// There are 2 cases: -/// 1. we can find good divisors of outer parallel dimensions and avoid -/// creating dynamic tile sizes. We can then vectorize to the reduction -/// size. -/// 2. we cannot find good divisors, we pay the price of dynamic loops. -/// -// TODO: Refine 1. with linalg splitting on the reduction dimension. -// TODO: Refine 2. with linalg splitting on the parallel dimension. -// -// Note: All this is to be able to handle very small and small-ish -// reductions without catastrophic regressions. -// TODO: Add another strategy based on segmented scans, which can allow us -// to force sizes that don't divide properly into warp shuffles. -class SmallReductionStrategy : public AbstractReductionStrategy, GPUStrategy { -public: - SmallReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig, const GPUModel &gpuModel); - - SmallReductionStrategy(const SmallReductionStrategy &) = default; - SmallReductionStrategy &operator=(const SmallReductionStrategy &) = default; - - std::array getNumThreadsInBlock() const { - std::array res{1, 1, 1}; - for (int64_t i = 0, e = workgroupTileSizes.size(); i < e; ++i) - res[i] = workgroupTileSizes[i]; - return res; - } - -private: - /// Compute the small strategy based on the problem size and the - /// `maxNumThreadsToUse`. 
- void configure(const ReductionConfig &reductionConfig); -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_SMALL_REDUCTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.cpp deleted file mode 100644 index fbe4b11e27e5..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.cpp +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. -using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using iree_compiler::IREE::transform_dialect::ShareForallOperandsOp; -using iree_compiler::IREE::transform_dialect::VectorToWarpExecuteOnLane0Op; -using iree_compiler::IREE::transform_dialect::VectorWarpDistributionOp; -using transform::FuseIntoContainingOp; -using transform::MatchOp; -using transform::ScalarizeOp; -using transform::SequenceOp; -using transform_ext::StructuredOpMatcher; - -using iree_compiler::buildTileReductionUsingScfForeach; -using iree_compiler::gpu::adjustNumberOfWarpsForBlockShuffle; -using iree_compiler::gpu::build1DSplittingStrategyWithOptionalThreadMapping; -using iree_compiler::gpu::buildCommonTrailingStrategy; -using iree_compiler::gpu::buildDistributeVectors; -using iree_compiler::gpu::kCudaMaxVectorLoadBitWidth; -using iree_compiler::gpu::ReductionConfig; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::gpu::StagedReductionStrategy; -using iree_compiler::gpu::threadX; -using iree_compiler::gpu::threadY; - -mlir::iree_compiler::gpu::StagedReductionStrategy::StagedReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig, const GPUModel &gpuModel) - : AbstractReductionStrategy(captures, {}), GPUStrategy(gpuModel) { - configure(reductionConfig); - LLVM_DEBUG(DBGS() << "use GPU staged reduction strategy\n"); - LLVM_DEBUG(llvm::interleaveComma(workgroupTileSizes, - DBGS() << "--workgroupTileSizes: "); - llvm::dbgs() << "\n"); -} - -void mlir::iree_compiler::gpu::StagedReductionStrategy::configure( - const ReductionConfig &reductionConfig) { - int64_t maxNumThreadsToUse = reductionConfig.maxNumThreads; - int64_t maxVectorSize = reductionConfig.vectorSize; - assert(maxNumThreadsToUse > 0 && "maxNumThreadsToUse must be > 
0"); - assert(maxNumThreadsToUse >= subgroupSize && "need at least a warp?"); - assert(maxVectorSize > 0 && "maxVectorSize must be > 0"); - - // Block-level - // =========== - // Tile all the parallel dimensions to 1 and create many blocks. - // TODO: Investigate taking some sizes that divide the dimensions and make - // the kernel meatier. - int64_t numParallelLoops = captures.reductionRank - 1; - workgroupTileSizes.append(numParallelLoops, 1); - - // Thread-level - // ============ - // Stage 1 - // ------- - // Maximal vector size that divides the problem size. - // TODO: Split to ensure 4 on most of the problem and use a 1-epilogue. - int64_t reductionDimensionSize = captures.reductionOpSizes.back(); - // Tile reduction to the maximal multiple `vectorSize` allowed. - // This locally reduces the large unknown reduction into a guaranteed - // multiple of `vectorSize`. - if (ShapedType::isDynamic(reductionDimensionSize)) { - // In the dynamic case, always run vector size of 1 and pad to the maximal - // warp size below the `maxNumThreadsToUse` limit. - vectorSize = 1; - numThreadsXInBlock = - iree_compiler::previousMultipleOf(maxNumThreadsToUse, subgroupSize); - } else { - // Adjust the vector size to the max power of 2 that divides the reduction, - // this dimensions the vector properly, whatever the elemental type. - assert((maxVectorSize & (maxVectorSize - 1)) == 0 && - "maxVectorSize must be a power of 2"); - // TODO: we could also split out the first multiple of vectorSize instead - // of reducing the vectorSize. This is better done with future stride / - // alignment in mind. - // TODO: splitting here also requires the post-bufferization privatization - // analysis (see #11715). - for (vectorSize = maxVectorSize; vectorSize > 1; vectorSize >>= 1) - if (reductionDimensionSize % vectorSize == 0) - break; - // Pad to the next multiple of the warp size above - // `reductionDimensionSize / vectorSize` but below `maxNumThreadsToUse`. - numThreadsXInBlock = std::min( - iree_compiler::nextMultipleOf(reductionDimensionSize / vectorSize, - subgroupSize), - iree_compiler::previousMultipleOf(maxNumThreadsToUse, subgroupSize)); - } -} - -static Value shareForeachArgument(ImplicitLocOpBuilder &b, Value Forall, - ArrayRef indices) { - auto foreachType = transform::OperationType::get( - b.getContext(), scf::ForallOp::getOperationName()); - Forall = b.create(foreachType, Forall); - return b - .create( - foreachType, Forall, indices); -} - -static void buildStagedReductionStrategyThreadLevel( - ImplicitLocOpBuilder &b, Value variantH, Value gridReductionH, - Value gridFillH, Value maybeTiledLeadingH, Value maybeTiledTrailingH, - const StagedReductionStrategy &strategy) { - MLIRContext *ctx = b.getContext(); - // Map the potential maybeTiledLeadingH. - // TODO: Consider fusing leading elementwise into threads. - if (strategy.captures.maybeLeadingRank > 0) { - int64_t vectorSize = - kCudaMaxVectorLoadBitWidth / - strategy.captures.maybeLeadingOutputElementalTypeBitWidth; - assert((vectorSize & (vectorSize - 1)) == 0 && "size must be power of 2"); - build1DSplittingStrategyWithOptionalThreadMapping( - /*b=*/b, - /*variantH=*/variantH, - /*opH=*/maybeTiledLeadingH, - /*rank=*/strategy.captures.maybeLeadingRank, - // TODO: capture and generalize mostMinorDim. 
- /*mostMinorDim=*/strategy.captures.maybeLeadingRank - 1, - /*opSizes=*/strategy.captures.leadingOpSizes, - /*numThreads=*/strategy.getNumThreadsInBlock().front(), - /*mappingAttr=*/threadX(ctx), - /*maxVectorSize=*/vectorSize); - } - - // Staged reduction step 1: break gridReductionH apart. - auto [blockParallelForallOp, blockParallelFillH, blockCombinerOpH] = - buildTileReductionUsingScfForeach( - /*b=*/b, - /*isolatedParentOpH=*/variantH, - /*reductionH=*/gridReductionH, - /*reductionRank=*/strategy.captures.reductionRank, - /*tileSize=*/strategy.getNumThreadsInBlock().front(), - /*reductionVectorSize=*/strategy.getVectorSize(), - /*mappingAttr=*/threadX(ctx)); - - // Staged reduction step 2: multi-warp shuffle reduce. - // Map the combiner reduction to one thread along y. Mapping this part along - // y only will trigger the insertion of an `scf.if (threadIdx.x == 0)` - // predicate after `scf.forall` is lowered. - // This predicate allows further vector distribution to kick in. - Value root = blockCombinerOpH; - SmallVector opsToFuse = {gridFillH}; - - // By the properties matching, we know the optional trailing op takes the - // result of the reduction as an input argument. - // It necessarily follows that maybeTrailingRank >= reductionRank - 1. - // When maybeTrailingRank == reductionRank - 1, by the properties of the - // transformations we have applied until now, we know that the elementwise is - // a simple scalar operation and it can be fused in the producing reduction - // without creating recomputations. - // TODO: Some `transform.assert` op that the shape of the op is indeed 1s only - // as a safety measure. - // TODO: More composable transform strategy parts require more matching after - // part of the strategy has been applied. See the discussion in #11951 for - // more context. - if (strategy.captures.maybeTrailingRank == - strategy.captures.reductionRank - 1) { - root = maybeTiledTrailingH; - opsToFuse.push_back(blockCombinerOpH); - } - iree_compiler::buildTileFuseDistToForallWithTileSizes( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/root, - /*opsToFuse=*/opsToFuse, - /*tileSizes=*/getAsOpFoldResult(b.getI64ArrayAttr({1})), - /*mappingAttr=*/b.getArrayAttr(threadY(ctx))); - - // Map the potential maybeTiledTrailingH if it hasn't been fused with the - // reduction. - if (root != maybeTiledTrailingH && strategy.captures.maybeTrailingRank > 0) { - int64_t vectorSize = - iree_compiler::gpu::kCudaMaxVectorLoadBitWidth / - strategy.captures.maybeTrailingOutputElementalTypeBitWidth; - build1DSplittingStrategyWithOptionalThreadMapping( - /*b=*/b, - /*variantH=*/variantH, - /*opH=*/maybeTiledTrailingH, - /*rank=*/strategy.captures.maybeTrailingRank, - // TODO: capture and generalize mostMinorDim. - /*mostMinorDim=*/strategy.captures.maybeTrailingRank - 1, - /*opSizes=*/strategy.captures.trailingOpSizes, - /*numThreads=*/strategy.getNumThreadsInBlock().front(), - /*mappingAttr=*/threadX(ctx), - /*maxVectorSize=*/vectorSize); - } -} - -/// Builds the transform IR tiling reductions for CUDA targets. Supports -/// reductions in the last dimension, with optional leading and trailing -/// elementwise operations. -void mlir::iree_compiler::gpu::buildStagedReductionStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const StagedReductionStrategy &strategy) { - // Step 1. Match and tile to introduce the top-level scf.forall for - // the block/workgroup level. Keep everything fused. 
- ArrayRef workgroupTileSizes{strategy.workgroupTileSizes}; - auto [maybeLeadingHBlock, gridFillH, gridReductionH, maybeTiledTrailingHBlock, - commonEnclosingForallH] = - buildReductionStrategyBlockDistribution( - b, variantH, - workgroupTileSizes.take_front(strategy.captures.reductionRank - 1)); - - // Step 2. Split the reduction and tile the pieces to ensure vector - // load/stores and mapping to a single warp with shuffles. - buildStagedReductionStrategyThreadLevel(b, variantH, gridReductionH, - gridFillH, maybeLeadingHBlock, - maybeTiledTrailingHBlock, strategy); - - // Step 3. Make sure we don't create allocation by sharing forall - // output. This amounts to injecting user-defined static information that each - // thread accesses only a private slice. This needs to be added late, once we - // don't need handles anymore, because contained handles are currently always - // invalidated, even when modified inplace. - // TODO: Relax nested invalidation for transforms that only move or modify - // contained ops inplace. - shareForeachArgument(b, commonEnclosingForallH, ArrayRef({0})); - - // Step 4-5. Common trailing steps. - auto [variantH2, funcH] = - buildCommonTrailingStrategy(b, variantH, strategy.getNumThreadsInBlock()); - - // Step 6. The staged strategy has a post-bufferization vector distribution - // with rank-reduction. The vector distribution occurs on multiple warps and - // is itself internally staged in 2 stages. - // Distribute the reduction on all the threads of the group. This allows us - // to have the same data layout for the partial reduction and the merge and - // therefore we can optimize away the temporary memory usage. - buildDistributeVectors(b, variantH2, funcH, strategy.getTotalNumThreads()); - - // Step 7. Apply clean up of memory operations. - funcH = b.create(variantH2, func::FuncOp::getOperationName()); - iree_compiler::buildMemoryOptimizations(b, funcH); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h deleted file mode 100644 index c76195c19f8f..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STAGED_REDUCTION_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STAGED_REDUCTION_STRATEGY_H_ - -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" - -namespace mlir::iree_compiler::gpu { - -/// Encode a 3-staged strategy for a 1-d reduction mapped to a block. -/// -/// This happens in a staged fashion to encode good tradeoffs between amount -/// of parallelism, occupancy and granularity of the load/store operations. -/// The tradeoff is controlled at a distance by specifying a -/// `maxNumThreadsToUse` upper bound. -/// -/// Bottom-up perspective: -/// ====================== -/// Stage 3: second stage of the the warp shuffle step reduces a vector -/// element to a single element. Only threadIdx == 0 commits to memory. 
-/// -/// Stage 2: the second stage of the reduction is the first stage of the warp -/// shuffle step. It is normalized to reduce from a "k-warps" abstraction, -/// across all warps in parallel, to a k-element result. Only the first thread -/// within each warp (e.g. threadIdx % subgroupSize == 0) commits to memory. -/// -/// Stage 1: the first stage of the reduction is normalized to run on "k-warps" -/// of maximal vector size for both the hardware and the problem sizes. -/// The over-provisioning to "k-warps" allows multiple warps to run in parallel. -/// The `numThreadsXInBlock` is this "k-warps" quantity and is also the -/// number of threads (i.e. blockDim.x) used to parallelize the problem. -/// This also results in `numThreadsXInBlock` live values that are -/// allocated in shared memory and creates a tradeoff between parallelism and -/// occupancy. -/// The normalization guarantees that whatever the problem size P, we reduce -/// from `tensor
` to `tensor` by using the -/// largest possible `vector.transfer` operations. The vector size is chosen as -/// follows: when the `reductionDimensionSize` is a multiple of 4, choose 4; -/// otherwise try with 2; otherwise just use 1. -// -// TODO: Split to ensure 4 on most of the problem and use a 1-epilogue. This is -// best done if we can encode the future stride to ensure the 4 is aligned. -class StagedReductionStrategy : public AbstractReductionStrategy, GPUStrategy { -public: - StagedReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig, const GPUModel &targetGpu); - - StagedReductionStrategy(const StagedReductionStrategy &) = default; - StagedReductionStrategy &operator=(const StagedReductionStrategy &) = default; - - std::array getNumThreadsInBlock() const { - return {numThreadsXInBlock, 1, 1}; - } - - int64_t getVectorSize() const { return vectorSize; } - - int64_t getNumWarps() const { - assert(numThreadsXInBlock % subgroupSize == 0 && - "staged reduction strategy requires full warps"); - return numThreadsXInBlock / subgroupSize; - } - - int64_t getTotalNumThreads() const { return numThreadsXInBlock; } - -private: - /// Compute the staged strategy based on the reductionDimensionSize, the - /// `maxNumThreadsToUse` and the `vectorSize`. - /// The latter 2 numbers control the tradeoff between parallelism and shared - /// memory consumption. - // TODO: Characterize shared memory consumption and limit for good occupancy. - void configure(const ReductionConfig &reductionConfig); - - /// Maximal vector size (among {1, 2, 4}) that divides the - /// `reductionDimensionSize` and is used for vector transfers in Stage 1. - int64_t vectorSize; - - /// Maximal "k-warp" size within the limits of the `maxNumThreadsToUse` and - /// `reductionDimensionSize` parameters. - /// This is also the blockDim.x of the kernel. - int64_t numThreadsXInBlock; -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STAGED_REDUCTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp deleted file mode 100644 index 770091bda511..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp +++ /dev/null @@ -1,814 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
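// [Illustrative aside, not part of the deleted sources] A standalone sketch of
// StagedReductionStrategy::configure above for the static-shape case: pick the
// largest power-of-two vector size (<= maxVectorSize) dividing the reduction
// size, then round the resulting "k-warp" thread count to subgroup multiples
// within the maxNumThreadsToUse budget. nextMultipleOf/previousMultipleOf are
// re-implemented inline as stand-ins for the deleted IREE helpers.
#include <algorithm>
#include <cstdint>
#include <utility>

static int64_t nextMultipleOf(int64_t v, int64_t m) { return ((v + m - 1) / m) * m; }
static int64_t previousMultipleOf(int64_t v, int64_t m) { return (v / m) * m; }

// Returns {vectorSize, numThreadsXInBlock} for a static reduction size.
std::pair<int64_t, int64_t> stagedReductionConfig(int64_t reductionDimensionSize,
                                                  int64_t maxVectorSize,
                                                  int64_t maxNumThreadsToUse,
                                                  int64_t subgroupSize) {
  int64_t vectorSize = maxVectorSize;  // assumed power of two, as asserted above
  for (; vectorSize > 1; vectorSize >>= 1)
    if (reductionDimensionSize % vectorSize == 0)
      break;
  int64_t numThreadsX =
      std::min(nextMultipleOf(reductionDimensionSize / vectorSize, subgroupSize),
               previousMultipleOf(maxNumThreadsToUse, subgroupSize));
  return {vectorSize, numThreadsX};
}
// e.g. reduction size 4096, maxVectorSize 4, maxNumThreads 256, subgroup 32
//      -> vectorSize 4, numThreadsX = min(1024, 256) = 256;
//      reduction size 70 -> vectorSize 2, numThreadsX = min(64, 256) = 64.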
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" - -#include - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" -#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" -#include "mlir/IR/TypeUtilities.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(llvm::dbgs() << '[' << DEBUG_TYPE << "] " << X) - -llvm::cl::opt clGPUEnableTransformDialectMatmulTensorCoreStrategy( - "iree-codegen-llvmgpu-enable-transform-dialect-matmul-tensorcore-strategy", - llvm::cl::desc("activate the matmul tensorcore strategy"), - llvm::cl::init(true)); -llvm::cl::opt clGPUEnableTransformDialectImplicitGemmStrategy( - "iree-codegen-llvmgpu-enable-transform-dialect-implicit-gemm-strategy", - llvm::cl::desc("activate the convolution implicit gemm strategy"), - llvm::cl::init(false)); -llvm::cl::opt clGPUEnableTransformDialectAlignedMatmul( - "iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul", - llvm::cl::desc( - "activate the matmul tensorcore strategy for tile aligned shapes"), - llvm::cl::init(false)); -llvm::cl::opt clGPUEnableTransformDialectSmallMatmul( - "iree-codegen-llvmgpu-enable-transform-dialect-small-matmul", - llvm::cl::desc("activate the matmul tensorcore strategy for small shapes " - "(< 16) in at least a dimension"), - llvm::cl::init(false)); -llvm::cl::opt clGPUEnableTransformDialectPadStrategy( - "iree-codegen-llvmgpu-enable-transform-dialect-pad-strategy", - llvm::cl::desc("activate the pad strategy"), llvm::cl::init(false)); -llvm::cl::opt clGPUEnableTransformDialectBatchMatmulStrategy( - "iree-codegen-llvmgpu-enable-transform-dialect-batch-matmul-strategy", - llvm::cl::desc("activate the batch matmul strategy, additional " - "configuration 
flags are shared with matmul"), - llvm::cl::init(false)); - -// TODO: significantly better namespacing. -using iree_compiler::gpu::AbstractGemmLikeStrategy; -using iree_compiler::gpu::BatchMatmulStrategy; -using iree_compiler::gpu::GPUModel; -using iree_compiler::gpu::ImplicitGemmStrategy; -using iree_compiler::gpu::kCudaMaxVectorLoadBitWidth; -using iree_compiler::gpu::MatmulStrategy; -using iree_compiler::gpu::PadConfig; -using iree_compiler::gpu::PadStrategy; -using iree_compiler::gpu::ReductionConfig; -using iree_compiler::gpu::ReductionStrategy; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::gpu::SmallReductionStrategy; -using iree_compiler::gpu::StagedReductionStrategy; -using transform_ext::CapturingOpMatcher; -using transform_ext::MatchCallbackOp; -using transform_ext::MatchedMatmulCaptures; -using transform_ext::MatchedPadCaptures; -using transform_ext::MatchedReductionCaptures; -using transform_ext::MatcherContext; -using transform_ext::RegisterMatchCallbacksOp; -using transform_ext::StructuredOpMatcher; - -//===----------------------------------------------------------------------===// -// Higher-level problem-specific strategy creation APIs, these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// - -//===--------------------------------------------------------------------===// -// Reduction strategies. -//===--------------------------------------------------------------------===// -/// Placeholder to encode fixed reductions that should take finer-grained -/// precedence over other heuristics. In the future, this could be lifted to -/// e.g. `gpuModel` or higher up in some transform dialect database summary of -/// "known good things". -static FailureOr applyKnownGoodReductionConfigurations( - const transform_ext::MatchedReductionCaptures &captures, - const GPUModel &gpuModel) { - auto staged = ReductionStrategy::Staged; - int64_t reductionSize = captures.reductionOpSizes.back(); - if (gpuModel.model == GPUModel::kDefaultGPU) { - if (captures.reductionOutputElementalTypeBitWidth == 32) { - if (reductionSize == 64) - return ReductionConfig{/*maxNumThreads=*/64, /*vectorSize=*/1, staged}; - if (reductionSize == 128) - return ReductionConfig{/*maxNumThreads=*/32, /*vectorSize=*/4, staged}; - if (reductionSize == 512) - return ReductionConfig{/*maxNumThreads=*/256, /*vectorSize=*/2, staged}; - } - } - return failure(); -} - -/// The configurations below have been determined empirically by performing a -/// manual tradeoff between problem size, amount of parallelism and vector -/// size on a particular NVIDIA RTX2080Ti 12GB card. This is a coarse tradeoff -/// that should generally give reasonably good results but that begs to be -/// complemented by hardcoded known good configurations and ultimately a -/// database and/or a random forest compression of configurations with -/// guaranteed performance. -// TODO: Lift some of the strategy sizing logic as hints and/or heuristics to -// also work properly in the dynamic case. -// TODO: Support more HW configs and make it more pluggable. 
-static ReductionConfig -getReductionConfig(const transform_ext::MatchedReductionCaptures &captures, - const GPUModel &gpuModel) { - auto maybeHardcodedConfiguration = - applyKnownGoodReductionConfigurations(captures, gpuModel); - if (succeeded(maybeHardcodedConfiguration)) - return *maybeHardcodedConfiguration; - - //===--------------------------------------------------------------------===// - // Small reduction strategy. - //===--------------------------------------------------------------------===// - // Dynamic reductions are never supported by default because we can - // never know offhand whether we are in a small-reduction regime mode. - // Since this mode does not coalesce reads, perf will suffer - // catastrophically on larger runtime reduction. - // TODO: explicit hint from above that we really want to do that. - int64_t redSize = captures.reductionOpSizes.back(); - bool isDynamicReduction = ShapedType::isDynamic(redSize); - // Otherwise, still only support the small cases for now and fall back to - // other strategies otherwise. - bool isSmallReduction = (redSize < 2 * gpuModel.subgroupSize); - if (!isDynamicReduction && isSmallReduction) { - int64_t maxNumThreads = 4 * gpuModel.subgroupSize; - return ReductionConfig{maxNumThreads, 0, ReductionStrategy::Small}; - } - - //===--------------------------------------------------------------------===// - // Staged reduction strategy. - //===--------------------------------------------------------------------===// - int64_t bitWidth = captures.reductionOutputElementalTypeBitWidth; - int64_t vectorSize = scaleUpByBitWidth(4, bitWidth); - int64_t maxNumThreads = 8 * gpuModel.subgroupSize; - // No adjustments in the dynamic case, we need extra information to make a - // good decision. - if (ShapedType::isDynamic(redSize)) - return ReductionConfig{maxNumThreads, vectorSize, - ReductionStrategy::Staged}; - // Scale down to smaller sizes (4, 8, 16)-warps. - if (scaleUpByBitWidth(redSize, bitWidth) <= 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(1, bitWidth); - maxNumThreads = 4 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) <= - 8 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(2, bitWidth); - maxNumThreads = 4 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) <= - 8 * 2 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 4 * gpuModel.subgroupSize; - } - // Scale up to larger sizes (32, 64, 128+)-warps, using vector-4. - if (!captures.trailingOpSizes.empty()) { - if (scaleUpByBitWidth(redSize, bitWidth) >= - 128 * 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 32 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) >= - 64 * 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 16 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) >= - 32 * 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 8 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) >= - 16 * 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 4 * gpuModel.subgroupSize; - } - } - return ReductionConfig{maxNumThreads, vectorSize, ReductionStrategy::Staged}; -} - -/// Map an N-D parallel, 1-D reduction operation with optional leading and -/// optional trailing elementwise operations. 
-/// The 1-D reduction dimension must be in the most minor dimension. -/// The innermost dimensions of the leading and trailing operations must be -/// most minor along all accesses. Return failure if matching fails. On a -/// successful match, configure a reduction strategy based on a proxy model of -/// the hardware and construct transform dialect IR that implements the -/// reduction strategy. The transform dialect IR is added in a top-level -/// ModuleOp after the `entryPoint` mlir::FunctionOpInterface. -static LogicalResult -matchAndSetReductionStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, const GPUModel &gpuModel) { - if (!gpuModel.hasWarpShuffle) { - LDBG("--Reduction strategy no warp shuffle\n"); - return failure(); - } - - // 1. Match a reduction and surrounding ops. - StructuredOpMatcher *reduction; - transform_ext::MatchedReductionCaptures captures; - transform_ext::MatcherContext matcherContext; - makeReductionMatcher(matcherContext, reduction, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *reduction)) { - LDBG("--Reduction strategy failed to match\n"); - return failure(); - } - - // 2. Construct the configuration and the strategy builder. - // TODO: Generalize along the HW axis. - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - ReductionConfig reductionConfig = getReductionConfig(captures, gpuModel); - if (reductionConfig.strategy == ReductionStrategy::Small) { - SmallReductionStrategy strategy(captures, reductionConfig, gpuModel); - return buildSmallReductionStrategy(b, variant, strategy); - } else if (reductionConfig.strategy == ReductionStrategy::Staged) { - // Otherwise, always fallback to the staged strategy. - StagedReductionStrategy strategy(captures, reductionConfig, gpuModel); - return buildStagedReductionStrategy(b, variant, strategy); - } else { - return llvm_unreachable("Unknown strategy"); - } - }; - - // 3. Build strategy embedded into the IR. - mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} - -//===--------------------------------------------------------------------===// -// Matmul strategies. -//===--------------------------------------------------------------------===// -/// Placeholder to encode fixed matmuls that should take finer-grained -/// precedence over other heuristics. In the future, this could be lifted to -/// e.g. `gpuModel` or higher up in some transform dialect database summary of -/// "known good things". -static FailureOr applyKnownGoodMatmulConfigurations( - const transform_ext::MatchedMatmulCaptures &captures, - const GPUModel &gpuModel) { - return failure(); -} - -static int64_t -selectLargestFailsafeValueIfNeeded(int64_t value, int64_t limit, - ArrayRef thresholds, - ArrayRef failSafeValues) { - for (auto [threshold, failSafeValue] : - llvm::zip(thresholds, failSafeValues)) { - if (limit < threshold && value > failSafeValue) - return failSafeValue; - } - return value; -} - -static void failSafeOverrides(MatmulStrategy &strategy, - const GPUModel &gpuModel) { - // Failsafe for blockTileM to avoid tiling by > size (i.e. no tiling). - int64_t blockTileM = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.blockTileM(), - /*limit=*/strategy.m(), - /*thresholds=*/{2, 4, 8, 16, 32, 64, 128}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 32, 64}); - // Failsafe for blockTileN to avoid tiling by > size (i.e. no tiling). 
- int64_t blockTileN = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.blockTileN(), - /*limit=*/strategy.n(), - /*thresholds=*/{2, 4, 8, 16, 32, 64, 128}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 32, 64}); - // Failsafe for reductionSize to avoid tiling by > size (i.e. no tiling). - int64_t reductionTileSize = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.reductionTileSize, - /*limit=*/strategy.k(), - /*thresholds=*/{2, 4, 8, 16, 24, 32, 40, 48, 56, 64}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 24, 32, 40, 48, 56}); - - // If some dimension is small, use fmas. - // TODO: more parallelism by locally splitting the K-loop and reducing in the - // fma case. - if (blockTileM < 16 || blockTileN < 16 || reductionTileSize < 16) { - strategy.useMmaSync = false; - strategy.useWmma = false; - strategy.useFma = true; - } - - strategy.blockTileSizes = {blockTileM, blockTileN}; - strategy.reductionTileSize = reductionTileSize; - - // Avoid too deep pipelines. This should also look at shared memory usage in - // the future. - if (strategy.pipelineDepth * strategy.reductionTileSize > strategy.k()) { - strategy.pipelineDepth = - llvm::divideFloorSigned(strategy.k(), strategy.reductionTileSize); - } -} - -/// The configurations below have been determined empirically. -// TODO: Significantly improve these heuristics. -static MatmulStrategy -getMatmulConfig(MLIRContext *context, - const transform_ext::MatchedMatmulCaptures &captures, - const GPUModel &gpuModel) { - MatmulStrategy strategy(context, captures, gpuModel); - if (strategy.cliOptionsSpecified) - return strategy; - - auto maybeHardcodedConfiguration = - applyKnownGoodMatmulConfigurations(captures, gpuModel); - if (succeeded(maybeHardcodedConfiguration)) - return *maybeHardcodedConfiguration; - - // TODO: encode a decision tree of reasonnable heuristics here. - - // Apply failsafe overrides to avoid identified bad corner cases. - failSafeOverrides(strategy, gpuModel); - - return strategy; -} - -/// Update the strategy to make sure it can be consumed by the codegen. In -/// particular, make sure that tile sizes are smaller than the problem sizes to -/// actually trigger tiling and mapping to blocks and threads. -static void failSafeOverrides(BatchMatmulStrategy &strategy, - const GPUModel &gpuModel) { - // Configure the strategy as if for a matmul. - failSafeOverrides(static_cast(strategy), gpuModel); - - // Failsafe for blockTileBatch to avoid tiling by > size (i.e. no tiling). - int64_t blockTileBatch = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.blockTileBatch(), - /*limit=*/strategy.batch(), - /*thresholds=*/{2, 4, 8, 16, 32, 64, 128}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 32, 64}); - - // Override the matmul configuration to be suitable for batch matmul. - // Specifically, prepend the tile size for the batch dimension and force FMA. - strategy.blockTileSizes.insert(strategy.blockTileSizes.begin(), - blockTileBatch); - - strategy.useMmaSync = false; - strategy.useWmma = false; - strategy.useFma = true; -} - -/// Produce a strategy for the batch matmul characterized by the given capture -/// list (shapes and types). -static BatchMatmulStrategy getBatchMatmulConfig(MLIRContext *context, - MatchedMatmulCaptures &captures, - const GPUModel &gpuModel) { - // Command-line arguments trump everything. - BatchMatmulStrategy strategy(context, gpuModel, captures); - if (strategy.cliOptionsSpecified) - return strategy; - - // TODO: fixed strategies and decision tree/heuristic. 
- - failSafeOverrides(strategy, gpuModel); - return strategy; -} - -/// Match the supported batch matmuls and set the transform dialect strategy for -/// them. -static LogicalResult -matchAndSetBatchMatmulStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, const GPUModel &gpuModel) { - if (!clGPUEnableTransformDialectBatchMatmulStrategy) { - LDBG("--Batch matmul strategy flag turned off\n"); - return failure(); - } - - StructuredOpMatcher *fill; - StructuredOpMatcher *bmm; - transform_ext::MatchedMatmulCaptures captures; - transform_ext::MatcherContext matcherContext; - transform_ext::makeBatchMatmulMatcher(matcherContext, bmm, fill, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *bmm)) { - LDBG("--Batch matmul strategy failed to match\n"); - return failure(); - } - - if (captures.contractionDims.batch.size() != 1 || - captures.contractionDims.m.size() != 1 || - captures.contractionDims.n.size() != 1 || - captures.contractionDims.k.size() != 1 || captures.batches()[0] != 0 || - captures.m() != 1 || captures.n() != 2 || captures.k() != 3) { - LDBG("--Only support batch matmul with b, m, n, k iterator order atm\n"); - return failure(); - } - - BatchMatmulStrategy strategy = - getBatchMatmulConfig(entryPoint->getContext(), captures, gpuModel); - if (failed(strategy.validate(gpuModel))) { - LDBG("--Batch matmul strategy failed to validate\n"); - return failure(); - } - - iree_compiler::createTransformRegion(entryPoint, [&](ImplicitLocOpBuilder &b, - Value variantH) { - return iree_compiler::gpu::buildBatchMatmulStrategy(b, variantH, strategy); - }); - return success(); -} - -static LogicalResult -matchAndSetMatmulStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, const GPUModel &gpuModel) { - if (!clGPUEnableTransformDialectMatmulTensorCoreStrategy) { - LDBG("--Matmul strategy flag turned off\n"); - return failure(); - } - - // 1. Match a reduction and surrounding ops. - StructuredOpMatcher *fill; - StructuredOpMatcher *matmul; - StructuredOpMatcher *trailing; - transform_ext::MatchedMatmulCaptures captures; - transform_ext::MatcherContext matcherContext; - makeMatmulMatcher(matcherContext, matmul, fill, trailing, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *matmul)) { - LDBG("--Matmul strategy fail to match\n"); - return failure(); - } - - // We are very peculiar about the dispatches we want to match for now: - // - f32 only atm. - // - Mandatory fill op. - // - No trailing op. - // - If the matmul is "too aligned", then guard on the alignment flag. - // - If the matmul is "too small", then use the default IREE strategy. - // - Otherwise, we take it. - if (!fill->getCaptured() || trailing->getCaptured()) { - LDBG("--Matmul strategy fill / trailing preconditions failed\n"); - return failure(); - } - - // TODO: Generalize to a good mix of sizes, alignments and element types. - const auto &matmulSize = captures.matmulOpSizes; - if (matmulSize.size() != 3) { - LDBG("--Matmul strategy size capture failed\n"); - return failure(); - } - - // Currently the unaligned transform strategy does not properly handle - // degenerate dimensions that should have been rank-reduced (e.g. `1`). - // Also, it is unprofitable to force small matmuls through a high latency - // tensorcore path, we are better off with a simple simt strategy. - // TODO: profitability details can be ironed out in the future when we have a - // heuristic to better select strategy parameters. 
- bool smallCases = (matmulSize[0] > 0 && matmulSize[0] < 16) || - (matmulSize[1] > 0 && matmulSize[1] < 16) || - (matmulSize[2] > 0 && matmulSize[2] < 16); - if (smallCases && !clGPUEnableTransformDialectSmallMatmul) { - LDBG("--Matmul strategy small size check failed\n"); - return failure(); - } - - // Currently the fully aligned case still lags behind the current default - // pipeline and thus is guarded by a flag. This is the case when at least one - // of the following holds - // - m is tile aligned (conservatively, take 64) - // - n is tile aligned (conservatively, take 64) - // - k is tile aligned (conservatively, take 16) - bool guardedAlignedCases = matmulSize[0] % 64 == 0 || - matmulSize[1] % 64 == 0 || matmulSize[2] % 16 == 0; - - if (!smallCases && guardedAlignedCases && - !clGPUEnableTransformDialectAlignedMatmul) { - LDBG("--Matmul strategy alignment check failed\n"); - return failure(); - } - - iree_compiler::gpu::MatmulStrategy strategy = - getMatmulConfig(op->getContext(), captures, gpuModel); - LLVM_DEBUG(strategy.dump()); - - // Validate the strategy configuration against the compilation target. - if (failed(strategy.validate(gpuModel))) { - LDBG("--Matmul strategy failed to validate\n"); - return failure(); - } - - // Limit the types that we choose to support without user intervention for - // tensor core. - if (!strategy.useFma && !strategy.cliOptionsSpecified && - (!captures.lhsElementType.isF32() || !captures.rhsElementType.isF32() || - !captures.outputElementType.isF32())) { - LDBG("--Matmul strategy elemental type check failed\n"); - return failure(); - } - - // 2. Construct the configuration and the strategy builder. - // TODO: Generalize along the HW axis. - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - return buildMatmulTensorCoreStrategy(b, variant, strategy); - }; - - // 3. Build strategy embedded into the IR. - mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} - -//===--------------------------------------------------------------------===// -// Convolution strategies. -//===--------------------------------------------------------------------===// -/// Placeholder to encode fixed convolutions that should take finer-grained -/// precedence over other heuristics. In the future, this could be lifted to -/// e.g. `gpuModel` or higher up in some transform dialect database summary of -/// "known good things". -static FailureOr applyKnownGoodConvolutionConfigurations( - const transform_ext::MatchedConvolutionCaptures &captures, - const GPUModel &gpuModel) { - return failure(); -} - -static void failSafeOverrides(ImplicitGemmStrategy &strategy, - const GPUModel &gpuModel) { - // Prefer a default block tile of 1 for the batch. - strategy.blockTileSizes = SmallVector{1, 128, 128}; - // Failsafe for blockTileM to avoid tiling by > size (i.e. no tiling). - int64_t blockTileM = selectLargestFailsafeValueIfNeeded( - strategy.blockTileM(), strategy.m(), {16, 32, 64, 128}, {1, 16, 32, 64}); - // Failsafe for blockTileN to avoid tiling by > size (i.e. no tiling). - int64_t blockTileN = selectLargestFailsafeValueIfNeeded( - strategy.blockTileN(), strategy.n(), {16, 32, 64, 128}, {1, 16, 32, 64}); - // Failsafe for reductionSize to avoid tiling by > size (i.e. no tiling). 
- int64_t reductionTileSize = selectLargestFailsafeValueIfNeeded( - strategy.reductionTileSize, strategy.k(), {8, 16, 24, 32, 40, 48, 56, 64}, - {1, 8, 16, 24, 32, 40, 48, 56}); - // Failsafe for blockTileBatch to avoid tiling by > size (i.e. no tiling). - int64_t blockTileBatch = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.blockTileBatch(), - /*limit=*/strategy.batch(), - /*thresholds=*/{2, 4, 8, 16, 32, 64, 128}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 32, 64}); - strategy.blockTileSizes = {blockTileBatch, blockTileM, blockTileN}; - strategy.reductionTileSize = reductionTileSize; - // Avoid too deep pipelines. This should also look at shared memory usage in - // the future. - if (strategy.pipelineDepth * strategy.reductionTileSize > strategy.k()) { - strategy.pipelineDepth = - llvm::divideFloorSigned(strategy.k(), strategy.reductionTileSize); - } -} - -/// The configurations below have been determined empirically. -// TODO: Significantly improve these heuristics. -static ImplicitGemmStrategy -getConvolutionConfig(MLIRContext *context, - const transform_ext::MatchedConvolutionCaptures &captures, - const GPUModel &gpuModel) { - ImplicitGemmStrategy strategy(context, captures, gpuModel); - if (strategy.cliOptionsSpecified) - return strategy; - - auto maybeHardcodedConfiguration = - applyKnownGoodConvolutionConfigurations(captures, gpuModel); - if (succeeded(maybeHardcodedConfiguration)) - return *maybeHardcodedConfiguration; - - // TODO: encode a decision tree of reasonnable heuristics here. - - // Apply failsafe overrides to avoid identified bad corner cases. - failSafeOverrides(strategy, gpuModel); - - return strategy; -} - -static LogicalResult -matchAndSetConvolutionStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, const GPUModel &gpuModel) { - if (!clGPUEnableTransformDialectImplicitGemmStrategy) { - LDBG("--Implicit gemm strategy flag turned off\n"); - return failure(); - } - - // 1. Match a reduction and surrounding ops. - StructuredOpMatcher *fill; - StructuredOpMatcher *convolution; - StructuredOpMatcher *trailing; - transform_ext::MatchedConvolutionCaptures captures; - transform_ext::MatcherContext matcherContext; - makeConvolutionMatcher(matcherContext, convolution, fill, trailing, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *convolution)) { - LDBG("--Implicit gemm strategy fail to match\n"); - return failure(); - } - - // We are very peculiar about the dispatches we want to match for now: - // - f32 or f16 only atm. - // - Mandatory fill op. - // - Require minimum tile alignment due to img2col. - // - Otherwise, we take it. - if (!fill->getCaptured() || trailing->getCaptured()) { - LDBG("--Implicit gemm strategy fill / trailing preconditions failed\n"); - return failure(); - } - - // Currently requires a typical 2d named convolution (conv_2d_nchw/nhwc). 
- if (captures.convolutionDims.outputChannel.size() != 1) { - return failure(); - } - if (captures.convolutionDims.inputChannel.size() != 1) { - return failure(); - } - if (captures.convolutionDims.outputImage.size() != 2) { - return failure(); - } - if (captures.convolutionDims.filterLoop.size() != 2) { - return failure(); - } - if (captures.convolutionDims.batch.size() != 1) { - return failure(); - } - - int64_t channelSize = 1; - for (auto dim : captures.convolutionDims.outputChannel) - channelSize *= captures.convolutionOpSizes[dim]; - int64_t imageSize = 1; - for (auto dim : captures.convolutionDims.outputImage) - imageSize *= captures.convolutionOpSizes[dim]; - - int64_t derivedK = 1; - for (auto dim : captures.convolutionDims.filterLoop) - derivedK *= captures.convolutionOpSizes[dim]; - for (auto dim : captures.convolutionDims.inputChannel) - derivedK *= captures.convolutionOpSizes[dim]; - - // Require tile-aligned due to the img2col op. - if (channelSize % 64 || imageSize % 64 || derivedK % 16) { - LDBG("--Implicit gemm strategy alignment check failed\n"); - return failure(); - } - - iree_compiler::gpu::ImplicitGemmStrategy strategy = - getConvolutionConfig(op->getContext(), captures, gpuModel); - - // Validate the strategy configuration against the compilation target. - if (failed(strategy.validate(gpuModel))) { - LDBG("--Implicit gemm strategy failed to validate\n"); - return failure(); - } - - // 2. Construct the configuration and the strategy builder. - // TODO: Generalize along the HW axis. - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - return buildConvolutionImplicitGemmStrategy(b, variant, strategy); - }; - - // 3. Build strategy embedded into the IR. - mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} - -//===--------------------------------------------------------------------===// -// Pad strategies. -//===--------------------------------------------------------------------===// - -/// Placeholder to encode fixed pads that should take finer-grained precedence -/// over other heuristics. In the future, this could be lifted to -/// e.g. `gpuModel` or higher up in some transform dialect database summary of -/// "known good things". -static FailureOr applyKnownGoodPadConfigurations( - const transform_ext::MatchedPadCaptures &captures, - const GPUModel &gpuModel) { - if (ArrayRef{captures.dims} == ArrayRef{1024, 1024}) { - return PadConfig{}; - } - return failure(); -} - -/// Placeholder to encode simple heuristics. -static PadConfig getPadConfig(const transform_ext::MatchedPadCaptures &captures, - const GPUModel &gpuModel) { - auto maybeHardcodedConfiguration = - applyKnownGoodPadConfigurations(captures, gpuModel); - if (succeeded(maybeHardcodedConfiguration)) - return *maybeHardcodedConfiguration; - return PadConfig{}; -} - -static LogicalResult -matchAndSetPadStrategy(mlir::FunctionOpInterface entryPoint, tensor::PadOp op, - const GPUModel &gpuModel) { - if (!clGPUEnableTransformDialectPadStrategy) { - LDBG("--Pad strategy flag turned off\n"); - return failure(); - } - - // 1. Match a padOp. 
- CapturingOpMatcher *pad; - MatchedPadCaptures captures; - MatcherContext matcherContext; - makePadMatcher(matcherContext, pad, captures, /*mustMatchEntireFunc=*/true); - - if (!matchPattern(op.getOperation(), *pad)) { - LDBG("--Pad strategy failed to match\n"); - return failure(); - } - if (captures.rank != 2) { - LDBG("--Pad strategy supported ranks check failed\n"); - return failure(); - } - if (!captures.elementType.isF32()) { - LDBG("--Pad strategy elemental type check failed\n"); - return failure(); - } - - // 2. Construct the strategy builder. - PadConfig padConfig = getPadConfig(captures, gpuModel); - iree_compiler::gpu::PadStrategy strategy(op->getContext(), captures, - padConfig, gpuModel); - if (strategy.useAsyncCopies) { - LDBG("--Async copies not supported yet\n"); - return failure(); - } - if (strategy.numThreads.size() > 3) { - LDBG("--Can only assign 3 num threads\n"); - return failure(); - } - // Make sure all thread numbers are set. - if (strategy.numThreads.size() != 3) { - strategy.numThreads.resize(3, 1); - } - - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - return buildPadStrategy(b, variant, strategy); - }; - - // 3. Build strategy embedded into the IR. - mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} - -//===--------------------------------------------------------------------===// -// Switch between strategies depending on matched IR. -//===--------------------------------------------------------------------===// -LogicalResult mlir::iree_compiler::gpu::matchAndSetTransformStrategy( - mlir::FunctionOpInterface entryPoint, Operation *op, - const GPUModel &gpuModel) { - LDBG("Look up a TD strategy for entryPoint:\n" << entryPoint << "\n"); - auto padOp = dyn_cast(op); - if (padOp) { - if (succeeded(matchAndSetPadStrategy(entryPoint, padOp, gpuModel))) { - LDBG("Activate pad strategy\n"); - return success(); - } - LDBG("Unmatched pad strategy\n"); - return failure(); - } - auto linalgOp = dyn_cast(op); - if (!linalgOp) { - LDBG("Not a Linalg op: " << *op << " -> Fail\n"); - return failure(); - } - if (succeeded(matchAndSetReductionStrategy(entryPoint, linalgOp, gpuModel))) { - LDBG("Activate reduction strategy\n"); - return success(); - } - if (succeeded(matchAndSetMatmulStrategy(entryPoint, linalgOp, gpuModel))) { - LDBG("Activate matmul\n"); - return success(); - } - if (succeeded( - matchAndSetBatchMatmulStrategy(entryPoint, linalgOp, gpuModel))) { - LDBG("Activate batch matmul\n"); - return success(); - } - if (succeeded( - matchAndSetConvolutionStrategy(entryPoint, linalgOp, gpuModel))) { - LDBG("Activate convolution\n"); - return success(); - } - // TODO: Add more transform dialect strategy for other kind of dispatch - // regions. - LDBG("No suitable strategy found\n"); - return failure(); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h deleted file mode 100644 index d8093faa9840..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STRATEGIES_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STRATEGIES_H_ - -#include "llvm/ADT/StringRef.h" -#include "mlir/Interfaces/FunctionInterfaces.h" - -namespace mlir { -class ImplicitLocOpBuilder; -class Value; -} // namespace mlir - -namespace mlir::iree_compiler::gpu { - -/// Forward declarations of all supported strategies. -class BatchMatmulStrategy; -class MatmulStrategy; -class PadStrategy; -class SmallReductionStrategy; -class StagedReductionStrategy; - -static constexpr int64_t kCudaWarpSize = 32; -static constexpr int64_t kCudaMaxNumThreads = 1024; - -/// Struct for representing supported WMMA/Cooperative Matrix configurations. -/// This is a reflection of SPIRV_CooperativeMatrixPropertiesNVAttr. -struct MMAConfig { - int64_t m; - int64_t n; - int64_t k; - Type aType; - Type bType; - Type cType; -}; - -/// Placeholder for some hardware model proxy that contains relevant information -/// to configure the strategies. In the future, this will need to be -/// driven by some contract with the runtime. -struct GPUModel { - static constexpr llvm::StringLiteral kDefaultGPU = "DefaultGPU"; - llvm::StringRef model = kDefaultGPU; - /// TODO: Support a range of subgroup sizes. - int64_t subgroupSize = kCudaWarpSize; - std::optional<int64_t> minSubgroupSize = std::nullopt; - std::optional<int64_t> maxSubgroupSize = std::nullopt; - int64_t maxWorkGroupInvocations = kCudaMaxNumThreads; - int64_t maxWorkGroupSize[3] = {1024, 1024, 64}; - bool hasWarpShuffle = false; - bool hasTF32TensorCore = false; - bool hasMmaSync = false; - SmallVector<MMAConfig> supportedWMMAConfigs = {}; -}; - -//===--------------------------------------------------------------------===// -// GPU strategy base. -//===--------------------------------------------------------------------===// -/// Basic structure to hold target specific information needed for all gpu -/// strategies. Certain quantities that can be dynamically selected, such as -/// subgroup size, will need to be configured with some contract with the -/// runtime. -struct GPUStrategy { - /// TODO: Configure subgroup size with the strategy and return the selected - /// size to the target (i.e. LLVMGPU or SPIR-V). - GPUStrategy(const GPUModel &gpuModel) : subgroupSize(gpuModel.subgroupSize) {} - /// TODO: Add other quantities relevant to strategy builders. - int64_t subgroupSize; -}; - -//===--------------------------------------------------------------------===// -// Matmul strategies. -//===--------------------------------------------------------------------===// -/// Entry point to build the transform IR corresponding to a tensorcore-based -/// strategy for linalg.fill + linalg.matmul on f32. -/// Does not support leading or trailing operations atm. -void buildMatmulTensorCoreStrategy(ImplicitLocOpBuilder &b, Value variantH, - const MatmulStrategy &strategy); - -//===--------------------------------------------------------------------===// -// Batch matmul strategies. -//===--------------------------------------------------------------------===// -/// Entry point to build the transform IR corresponding to an FMA-based strategy -/// for linalg.fill + linalg.batch_matmul. -void buildBatchMatmulStrategy(ImplicitLocOpBuilder &b, Value variantH, - const BatchMatmulStrategy &strategy); - -//===--------------------------------------------------------------------===// -// Pad strategies.
-//===--------------------------------------------------------------------===// -/// Entry point to build the transform IR corresponding to a simple pad -/// strategy. -/// Does not support leading or trailing operations atm. -void buildPadStrategy(ImplicitLocOpBuilder &b, Value variantH, - const PadStrategy &strategy); - -//===--------------------------------------------------------------------===// -// Reduction strategies. -//===--------------------------------------------------------------------===// -/// Structure to hold a summary of HW-derived properties to configure the -/// reduction strategy. -/// The objective of this struct is to act as a minimal summary of key -/// properties derived from the hardware (e.g. by an oracle) and that are -/// sufficient to steer the strategy to produce a good version. -/// These can be thought of as latent variables or embeddings that directly -/// control the strategy and can be derived from the hardware by some procedure. -enum class ReductionStrategy { Small, Staged }; -struct ReductionConfig { - int64_t maxNumThreads; - int64_t vectorSize; - ReductionStrategy strategy; -}; - -/// Entry point to build the transform IR corresponding to a small reduction -/// strategy. -/// This is used for mapping an N-D parallel, 1-D reduction operation with a -/// small reduction on which the default staged reduction strategy is otherwise -/// inefficient. -/// The 1-D reduction dimension must be in the most minor dimension. -/// Supports an optional leading and an optional trailing elementwise operation. -void buildSmallReductionStrategy(ImplicitLocOpBuilder &b, Value variantH, - const SmallReductionStrategy &strategy); - -/// Entry point to build the transform IR corresponding to a staged reduction -/// strategy. -/// This is used for mapping an N-D parallel, 1-D reduction operation. -/// The 1-D reduction dimension must be in the most minor dimension. -/// Supports an optional leading and an optional trailing elementwise operation. -void buildStagedReductionStrategy(ImplicitLocOpBuilder &b, Value variantH, - const StagedReductionStrategy &strategy); - -//===----------------------------------------------------------------------===// -// Higher-level strategy creation APIs; these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// - -/// Try to find an existing transform dialect strategy for a given entry point.
-LogicalResult matchAndSetTransformStrategy(mlir::FunctionOpInterface entryPoint, - Operation *op, - const GPUModel &gpuModel); - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STRATEGIES_H_ diff --git a/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp index 7ab5d266dbdb..c4d5d62349df 100644 --- a/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp @@ -8,14 +8,6 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/VMVX/KernelDispatch.h" #include "iree/compiler/Codegen/VMVX/Passes.h" -#include "iree/compiler/Dialect/HAL/IR/HALDialect.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" -#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" @@ -34,18 +26,7 @@ class VMVXSelectLoweringStrategyPass VMVXSelectLoweringStrategyPass> { public: void getDependentDialects(DialectRegistry ®istry) const override { - // TODO(qedawkins): Once TransformStrategies is deprecated, drop the - // unnecessary dialect registrations. - // clang-format off - registry.insert(); - // clang-format on + registry.insert(); } void runOnOperation() override;
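For readers skimming this removal: the deleted matchAndSetTransformStrategy above performed a first-match-wins cascade over per-op strategy matchers (pad, then reduction, then matmul, then batch matmul, then convolution). The following standalone C++ sketch illustrates only that dispatch pattern; all type and function names in it are placeholders invented for the illustration, not IREE APIs, and the sketch is not part of the patch itself.

// Standalone illustration (placeholder names, not IREE APIs): the removed
// matchAndSetTransformStrategy tried one strategy matcher after another and
// stopped at the first that claimed the op; otherwise it reported failure so
// the caller could fall back to the default lowering heuristics.
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

namespace sketch {

// Stand-in for the matched operation (tensor.pad, reduction, matmul, ...).
struct Op {
  std::string kind;
};

// Stand-in for the hardware proxy that the removed GPUModel struct provided.
struct GpuModel {
  bool hasWarpShuffle = true;
};

// A matcher either claims the op (returning a strategy name) or declines.
using Matcher =
    std::function<std::optional<std::string>(const Op &, const GpuModel &)>;

// First-match-wins cascade; the matcher order encodes strategy precedence.
std::optional<std::string> dispatch(const Op &op, const GpuModel &gpu,
                                    const std::vector<Matcher> &matchers) {
  for (const Matcher &match : matchers)
    if (std::optional<std::string> strategy = match(op, gpu))
      return strategy;
  return std::nullopt; // No suitable strategy found.
}

} // namespace sketch

int main() {
  using namespace sketch;
  const std::vector<Matcher> matchers = {
      [](const Op &op, const GpuModel &) -> std::optional<std::string> {
        if (op.kind == "pad")
          return "pad strategy";
        return std::nullopt;
      },
      [](const Op &op, const GpuModel &gpu) -> std::optional<std::string> {
        // The removed reduction path also required warp-shuffle support.
        if (op.kind == "reduction" && gpu.hasWarpShuffle)
          return "reduction strategy";
        return std::nullopt;
      },
      [](const Op &op, const GpuModel &) -> std::optional<std::string> {
        if (op.kind == "matmul")
          return "matmul tensor core strategy";
        return std::nullopt;
      },
  };

  GpuModel gpu;
  const std::vector<std::string> kinds = {"pad", "reduction", "matmul", "conv"};
  for (const std::string &kind : kinds) {
    std::optional<std::string> strategy = dispatch(Op{kind}, gpu, matchers);
    std::cout << kind << " -> "
              << (strategy ? *strategy : std::string("no strategy (fallback)"))
              << "\n";
  }
  return 0;
}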