diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index fc7859be5a0f..11cf13cc35c2 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -62,7 +62,6 @@ /compiler/src/iree/compiler/Codegen/LLVMCPU/ @hanhanW @MaheshRavishankar /compiler/src/iree/compiler/Codegen/LLVMGPU/ @MaheshRavishankar @qedawkins @kuhar @Groverkss /compiler/src/iree/compiler/Codegen/SPIRV/ @antiagainst @MaheshRavishankar @kuhar -/compiler/src/iree/compiler/Codegen/TransformStrategies/ @qedawkins @MaheshRavishankar /compiler/src/iree/compiler/ConstEval/ @hanhanW @stellaraccident /compiler/src/iree/compiler/Dialect/Encoding/ @bjacob @hanhanW /compiler/src/iree/compiler/Dialect/Flow/ @hanhanW @MaheshRavishankar @IanWood1 diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel index 616e39394d4f..7aca986d540b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel @@ -307,8 +307,6 @@ iree_compiler_cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:Support", "@llvm-project//mlir:DialectUtils", - # TransformStrategies - "//compiler/src/iree/compiler/Codegen/TransformStrategies/Common:TransformStrategies", # TransformExtensions (needed for registration in the pass) "//llvm-external-projects/iree-dialects:IREEDialectsTransforms", "//compiler/src/iree/compiler/Codegen/Common/TransformExtensions:CommonExtensions", diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt index 648805b515ee..764bc258c902 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt @@ -283,7 +283,6 @@ iree_cc_library( iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect iree::compiler::Codegen::LLVMCPU::TransformExtensions::LLVMCPUExtensions iree::compiler::Codegen::LLVMGPU::TransformExtensions::LLVMGPUExtensions - iree::compiler::Codegen::TransformStrategies::Common::TransformStrategies iree::compiler::Dialect::Encoding::IR iree::compiler::Dialect::Flow::IR iree::compiler::Dialect::Flow::TransformExtensions::FlowExtensions diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel index 2a96c4beb54a..e2e15dbbfa68 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel @@ -93,7 +93,6 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Common/CPU:CommonCPUPasses", "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Interfaces:PartitionableLoopsInterface", - "//compiler/src/iree/compiler/Codegen/TransformStrategies/CPU", "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "//compiler/src/iree/compiler/Dialect/Flow/IR", diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt index 8db7e3770149..9eb8dc155d0c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt @@ -151,7 +151,6 @@ iree_cc_library( iree::compiler::Codegen::Common::TransformDialectInterpreterPass iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect iree::compiler::Codegen::Interfaces::PartitionableLoopsInterface - 
iree::compiler::Codegen::TransformStrategies::CPU iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils iree::compiler::Dialect::Flow::IR diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp index c68c905c14d1..6f9983454af5 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp @@ -11,7 +11,6 @@ #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" #include "iree/compiler/Codegen/LLVMCPU/TargetMLTransformInfo.h" #include "iree/compiler/Codegen/LLVMCPU/Utils.h" -#include "iree/compiler/Codegen/TransformStrategies/CPU/Common.h" #include "iree/compiler/Codegen/Utils/CPUUtils.h" #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" #include "iree/compiler/Codegen/Utils/Utils.h" @@ -100,12 +99,6 @@ static llvm::cl::opt clDisableArmSMETiling( "target (i.e., when the +sme feature flag is present)"), llvm::cl::init(false)); -// Non-static options are used in other places. -llvm::cl::opt clEnableTransformDialectJit( - "iree-llvmcpu-enable-transform-dialect-jit", - llvm::cl::desc("enable the usage of the transform dialect JIT"), - llvm::cl::init(false)); - using IREE::Codegen::DispatchLoweringPassPipeline; // Encodes the pre-processing strategy to be applied on a Linalg operation @@ -2007,28 +2000,6 @@ setDefaultGenericOpRootConfig(mlir::FunctionOpInterface entryPointFn, /*subgroupSize=*/{}, pipelineConfig); } -/// Set lowering info to be used by the transform dialect jitter. -static LogicalResult -setTransformStrategyRootConfig(mlir::FunctionOpInterface entryPointFn, - linalg::GenericOp genericOp, - const LinalgOpInfo &linalgOpInfo, - const TargetMLTransformInfo &targetMLTransInfo) { - assert(!getLoweringConfig(genericOp) && - "expected lowering_config is not set"); - if (!clEnableTransformDialectJit) - return failure(); - cpu::CPUModel cpuModel; - if (failed( - cpu::matchAndSetReductionStrategy(entryPointFn, genericOp, cpuModel))) - return failure(); - auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( - entryPointFn->getContext(), - IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen); - if (failed(setTranslationInfo(entryPointFn, translationInfo))) - return failure(); - return success(); -} - /// Utility to return the transpose vector `sizes` for X86. Empty `sizes` on /// return indicates failure. static void getTransposeX86VectorSizes( @@ -2284,11 +2255,6 @@ setRootConfig(mlir::FunctionOpInterface entryPointFn, const TargetMLTransformInfo &targetMLTransInfo) { assert(!getLoweringConfig(genericOp) && "expected lowering_config is not set"); - // First, try to apply the transform dialect strategy, if defined. 
- if (succeeded(setTransformStrategyRootConfig( - entryPointFn, genericOp, linalgOpInfo, targetMLTransInfo))) { - return success(); - } if (succeeded(setTransposeLikeOpRootConfig( entryPointFn, genericOp, linalgOpInfo, targetMLTransInfo))) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp index aeb2b6443a0d..6e64454e0ce3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp @@ -10,16 +10,6 @@ #include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h" #include "iree/compiler/Codegen/LLVMCPU/Passes.h" #include "iree/compiler/Codegen/LLVMCPU/Utils.h" -#include "iree/compiler/Dialect/HAL/IR/HALDialect.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" -#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/PDL/IR/PDL.h" -#include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" @@ -38,22 +28,7 @@ class LLVMCPUSelectLoweringStrategyPass LLVMCPUSelectLoweringStrategyPass> { public: void getDependentDialects(DialectRegistry ®istry) const override { - // TODO(qedawkins): Once TransformStrategies is deprecated, drop the - // unnecessary dialect registrations. - // clang-format off - registry.insert(); - // clang-format on + registry.insert(); } void runOnOperation() override; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel index 3dd48f9e88fb..b074612adbc5 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel @@ -140,7 +140,6 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Interfaces:UKernelOpInterface", "//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions", "//compiler/src/iree/compiler/Codegen/LLVMGPU/Utils", - "//compiler/src/iree/compiler/Codegen/TransformStrategies/GPU", "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt index 8657fc8cf2ce..6a92f60d7f04 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt @@ -183,7 +183,6 @@ iree_cc_library( iree::compiler::Codegen::Interfaces::UKernelOpInterface iree::compiler::Codegen::LLVMGPU::TransformExtensions::LLVMGPUExtensions iree::compiler::Codegen::LLVMGPU::Utils - iree::compiler::Codegen::TransformStrategies::GPU iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils iree::compiler::Codegen::Utils::VectorOpUtils diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 807b9fdb84db..ff002ace5b0f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -18,7 
+18,6 @@ #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" #include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h" #include "iree/compiler/Codegen/LLVMGPU/Passes.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" #include "iree/compiler/Codegen/Utils/Utils.h" @@ -63,11 +62,6 @@ llvm::cl::opt clGPUEnableVectorDistribution( llvm::cl::desc("enable the usage of the vector distribution pipeline"), llvm::cl::init(true)); -llvm::cl::opt clGPUEnableTransformDialectJit( - "iree-codegen-llvmgpu-enable-transform-dialect-jit", - llvm::cl::desc("enable the usage of the transform dialect JIT"), - llvm::cl::init(false)); - /// Flag to force using WMMA tensorcore operations. llvm::cl::opt clGPUUseWMMA("iree-codegen-llvmgpu-use-wmma", @@ -1392,57 +1386,6 @@ static LogicalResult setRootDefaultConfig(IREE::GPU::TargetAttr target, preferredSubgroupSize); } -//====---------------------------------------------------------------------===// -// Transform Dialect Pipeline Configuration -//====---------------------------------------------------------------------===// - -/// Set configuration for transform dialect based strategies. -static LogicalResult -setTransformDialectConfig(IREE::GPU::TargetAttr target, - mlir::FunctionOpInterface entryPoint, Operation *op) { - if (!clGPUEnableTransformDialectJit) { - return failure(); - } - - auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( - entryPoint.getContext(), CodeGenPipeline::TransformDialectCodegen); - - // TODO: unify the target informations into one structure. - iree_compiler::gpu::GPUModel gpuModel; - gpuModel.hasWarpShuffle = target.supportsSubgroupShuffle(); - gpuModel.hasTF32TensorCore = target.supportsTF32InputMMAOps(); - gpuModel.hasMmaSync = target.supportsSyncMMAOps(); - - // Populates a subset of the fragment combinations supported in MLIR lowerings - // to NVVM (which is itself a subset of what LLVM supports) based on what the - // pipeline currently supports. - // TODO: avoid hard coding this and populate based on hardware capabilities. - // TODO: add missing supported configs once the pipeline supports it. - MLIRContext *context = entryPoint.getContext(); - Type f32Type = Float32Type::get(context); - Type f16Type = Float16Type::get(context); - - iree_compiler::gpu::MMAConfig f16f32AccConfig = { - /*m=*/16, /*n=*/16, /*k=*/16, - /*aType=*/f16Type, /*bType=*/f16Type, /*cType=*/f32Type}; - iree_compiler::gpu::MMAConfig f16f16AccConfig = { - /*m=*/16, /*n=*/16, /*k=*/16, - /*aType=*/f16Type, /*bType=*/f16Type, /*cType=*/f16Type}; - gpuModel.supportedWMMAConfigs = {f16f32AccConfig, f16f16AccConfig}; - - if (target.supportsTF32InputMMAOps()) { - iree_compiler::gpu::MMAConfig tf32WmmaConfig = { - /*m=*/16, /*n=*/16, /*k=*/8, - /*aType=*/f32Type, /*bType=*/f32Type, /*cType=*/f32Type}; - gpuModel.supportedWMMAConfigs.push_back(tf32WmmaConfig); - } - - if (failed(iree_compiler::gpu::matchAndSetTransformStrategy(entryPoint, op, - gpuModel))) - return failure(); - return setTranslationInfo(entryPoint, translationInfo); -} - static bool isMatvecLike(linalg::LinalgOp linalgOp) { if (linalgOp.getNumParallelLoops() != 2) return false; @@ -2015,11 +1958,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target, computeOp->print(llvm::dbgs(), OpPrintingFlags().skipRegions()); llvm::dbgs() << "\n"; }); - // First try to see if there is a transform dialect configuration existing. 
- if (succeeded(setTransformDialectConfig(target, entryPointFn, computeOp))) { - LDBG("Transform Dialect Config"); - return success(); - } if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn, computeOp))) { LDBG("Tile and fuse data tiled multi_mma config"); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp index a6d630717bb6..396bbd96e825 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp @@ -8,18 +8,6 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h" #include "iree/compiler/Codegen/LLVMGPU/Passes.h" -#include "iree/compiler/Dialect/HAL/IR/HALDialect.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" -#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" -#include "mlir/Dialect/PDL/IR/PDL.h" -#include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" @@ -41,24 +29,8 @@ class LLVMGPUSelectLoweringStrategyPass final LLVMGPUSelectLoweringStrategyPass>::LLVMGPUSelectLoweringStrategyPassBase; void getDependentDialects(DialectRegistry ®istry) const override { - // TODO(qedawkins): Once TransformStrategies is deprecated, drop the - // unnecessary dialect registrations. 
- // clang-format off registry - .insert(); - // clang-format on + .insert(); } void runOnOperation() override; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 40945f27454c..00bc6f967acf 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -43,13 +43,8 @@ iree_lit_test_suite( "nvvm_mma_sync_pipeline_test.mlir", "reduction_pipeline_cuda.mlir", "reduction_pipeline_rocm.mlir", - "reduction_pipeline_transform_cuda.mlir", - "reduction_pipeline_transform_rocm.mlir", + "reduction_pipeline_softmax_rocm.mlir", "rocdl_pipeline_test.mlir", - "set_transform_strategy_batch_matmul.mlir", - "set_transform_strategy_convolution.mlir", - "set_transform_strategy_matmul.mlir", - "set_transform_strategy_pad.mlir", "illegal_configuration.mlir", "legalize.mlir", "linalg_transform.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index b771513dd764..6be97c06d533 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -49,13 +49,8 @@ iree_lit_test_suite( "promote_matmul_to_fit_mma.mlir" "reduction_pipeline_cuda.mlir" "reduction_pipeline_rocm.mlir" - "reduction_pipeline_transform_cuda.mlir" - "reduction_pipeline_transform_rocm.mlir" + "reduction_pipeline_softmax_rocm.mlir" "rocdl_pipeline_test.mlir" - "set_transform_strategy_batch_matmul.mlir" - "set_transform_strategy_convolution.mlir" - "set_transform_strategy_matmul.mlir" - "set_transform_strategy_pad.mlir" "tensor_pad.mlir" "tensorcore_vectorization.mlir" "transform_dialect_bufferize.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir index 50d989599819..43a7164ac441 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir @@ -1,7 +1,7 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false --iree-gpu-test-target=sm_60 %s | FileCheck %s +// RUN: --iree-gpu-test-target=sm_60 %s | FileCheck %s // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false --iree-gpu-test-target=sm_80 %s | FileCheck %s --check-prefix=SM80 +// RUN: --iree-gpu-test-target=sm_80 %s | FileCheck %s --check-prefix=SM80 // Transform dialect attributes are tested separately. 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir index f292df715093..cfa8875c9685 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir @@ -1,12 +1,10 @@ // RUN: iree-opt %s --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline, func.func(iree-llvmgpu-lower-executable-target))" \ // RUN: --iree-gpu-test-target=sm_60 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ // RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_bufferize_spec.mlir@__transform_main | \ // RUN: FileCheck %s // RUN: iree-opt %s --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline, func.func(iree-llvmgpu-lower-executable-target))" \ // RUN: --iree-gpu-test-target=sm_60 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ // RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir@__transform_main | \ // RUN: FileCheck %s --check-prefix=FOREACH-TO-GPU diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir index cbab841c3f27..e7aaae07cdcb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, iree-codegen-lower-executable-using-transform-dialect, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s +// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s #pipeline_layout = #hal.pipeline.layout, @@ -37,66 +37,27 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { } } -// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4)> -// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info +// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info // CHECK: func.func @warp_reduction_dispatch() // CHECK-SAME: translation_info = #[[TRANSLATION_INFO]] -// CHECK-DAG: %[[C0I:.+]] = arith.constant 0 : i32 -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : i32 -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : i32 -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : i32 -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : i32 -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : i32 -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : i32 -// CHECK-DAG: %[[C32I:.+]] = arith.constant 32 : index -// CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index -// CHECK-DAG: %[[C10240:.+]] = arith.constant 10240 : index -// CHECK-DAG: %[[IDENTITY:.+]] = arith.constant 0.000000e+00 : f32 -// CHECK-DAG: %[[CF:.+]] = arith.constant 1.000000e+00 : f32 -// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1xf32> +// CHECK-DAG: 
%[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<4xf32> // CHECK-DAG: %[[TID:.+]] = gpu.thread_id x -// CHECK: %[[TID4:.+]] = affine.apply #[[$MAP]]()[%[[TID]]] -// CHECK: %[[R0:.+]] = scf.for %{{.*}} = %[[TID4]] to %[[C10240]] step %[[C1024]] iter_args(%[[A0:.+]] = %[[CST]]) -> (vector<1xf32>) { -// CHECK: %[[V:.+]] = vector.transfer_read {{.*}} {in_bounds = [true]} : memref<512x10240xf32, #hal.descriptor_type>, vector<4xf32> -// CHECK: %[[E:.+]] = vector.extract %[[A0]][0] : f32 from vector<1xf32> -// CHECK: %[[RL:.+]] = vector.reduction , %[[V]], %[[E]] : vector<4xf32> into f32 -// CHECK: %[[B:.+]] = vector.broadcast %[[RL:.*]] : f32 to vector<1xf32> -// CHECK: scf.yield %[[B]] : vector<1xf32> +// CHECK: %[[R0:.+]] = scf.for %{{.*}} = %c0 to %c10240 step %c1024 iter_args(%[[A0:.+]] = %[[CST]]) -> (vector<4xf32>) { +// CHECK: %[[V:.+]] = vector.transfer_read {{.*}} : memref<512x10240xf32, #hal.descriptor_type>, vector<4xf32> +// CHECK: %[[ADD:.+]] = arith.addf %[[V]], %[[A0]] : vector<4xf32> +// CHECK: scf.yield %[[ADD]] : vector<4xf32> // CHECK: } -// CHECK: %[[R1:.+]] = vector.extract %[[R0]][0] : f32 from vector<1xf32> -// CHECK: %[[S0:.+]], %{{.*}} = gpu.shuffle xor %[[R1]], %[[C1]], %[[C32]] : f32 -// CHECK: %[[R2:.+]] = arith.addf %[[R1]], %[[S0]] : f32 -// CHECK: %[[S1:.+]], %{{.*}} = gpu.shuffle xor %[[R2]], %[[C2]], %[[C32]] : f32 -// CHECK: %[[R3:.+]] = arith.addf %[[R2]], %[[S1]] : f32 -// CHECK: %[[S2:.+]], %{{.*}} = gpu.shuffle xor %[[R3]], %[[C4]], %[[C32]] : f32 -// CHECK: %[[R4:.+]] = arith.addf %[[R3]], %[[S2]] : f32 -// CHECK: %[[S3:.+]], %{{.*}} = gpu.shuffle xor %[[R4]], %[[C8]], %[[C32]] : f32 -// CHECK: %[[R5:.+]] = arith.addf %[[R4]], %[[S3]] : f32 -// CHECK: %[[S4:.+]], %{{.*}} = gpu.shuffle xor %[[R5]], %[[C16]], %[[C32]] : f32 -// CHECK: %[[R6:.+]] = arith.addf %[[R5]], %[[S4]] : f32 +// CHECK-COUNT-5: gpu.shuffle xor {{.*}} : f32 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<8xf32, #gpu.address_space> -// CHECK: %[[WID:.+]] = arith.divui %{{.*}}, %{{.*}} : index -// CHECK: %[[LANE_ID:.*]] = arith.remui %[[TID]], %[[C32I]] : index -// CHECK: %[[LANE0:.*]] = arith.cmpi eq, %[[LANE_ID]], %[[C0]] : index -// CHECK: scf.if %[[LANE0]] { -// CHECK: memref.store %[[R6]], %[[ALLOC]][%[[WID]]] : memref<8xf32, #gpu.address_space> +// CHECK: scf.if %{{.*}} { +// CHECK: memref.store %{{.*}}, %[[ALLOC]]{{.*}} : memref<8xf32, #gpu.address_space> // CHECK: } // CHECK: gpu.barrier -// CHECK: %[[LANE_ID_IN_BOUNDS:.*]] = arith.minui %[[LANE_ID]] -// CHECK: %[[LOAD_VAL:.+]] = memref.load %[[ALLOC]][%[[LANE_ID_IN_BOUNDS]]] : memref<8xf32, #gpu.address_space> -// CHECK: %[[S5:.+]], %{{.*}} = gpu.shuffle xor %[[LOAD_VAL]], %[[C1]], %[[C32]] : f32 -// CHECK: %[[R7:.+]] = arith.addf %[[LOAD_VAL]], %[[S5]] : f32 -// CHECK: %[[S6:.+]], %{{.*}} = gpu.shuffle xor %[[R7]], %[[C2]], %[[C32]] : f32 -// CHECK: %[[R8:.+]] = arith.addf %[[R7]], %[[S6]] : f32 -// CHECK: %[[S7:.+]], %{{.*}} = gpu.shuffle xor %[[R8]], %[[C4]], %[[C32]] : f32 -// CHECK: %[[R9:.+]] = arith.addf %[[R8]], %[[S7]] : f32 -// CHECK: %[[S9:.+]], %{{.*}} = gpu.shuffle idx %[[R9]], %[[C0I]], %[[C32]] : f32 -// CHECK: %[[R12:.+]] = arith.addf %[[S9]], %[[CF]] : f32 -// CHECK: %[[R13:.+]] = vector.broadcast %[[R12]] : f32 to vector<1xf32> -// CHECK: %[[TID0:.+]] = arith.cmpi eq, %[[TID]], %[[C0]] : index -// CHECK: scf.if %[[TID0]] { -// CHECK: vector.transfer_write %[[R13]], %{{.*}}[%{{.*}}] {in_bounds = [true]} : vector<1xf32>, memref<512xf32, #hal.descriptor_type> +// CHECK: memref.load %[[ALLOC]]{{.*}} : 
memref<8xf32, #gpu.address_space> +// CHECK-COUNT-3: gpu.shuffle xor {{.*}} : f32 +// CHECK: gpu.shuffle idx {{.*}} : f32 +// CHECK: scf.if %{{.*}} { +// CHECK: vector.transfer_write {{.*}} : vector<1xf32>, memref<512xf32, #hal.descriptor_type> // CHECK: } // ----- @@ -149,43 +110,21 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { } } -// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info +// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info // CHECK: func.func @warp_reduction_broadcast_dispatch() // CHECK-SAME: translation_info = #[[TRANSLATION_INFO]] -// CHECK: scf.for {{.*}} -> (vector<1xf32>) { +// CHECK: scf.for {{.*}} -> (vector<4xf32>) { // CHECK: vector.transfer_read {{.*}} : memref<512x10240xf32, #hal.descriptor_type>, vector<4xf32> -// CHECK: vector.reduction , {{.*}} : vector<4xf32> into f32 +// CHECK: arith.addf {{.*}} : vector<4xf32> // CHECK: scf.yield -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: arith.remui +// CHECK-COUNT-5: gpu.shuffle xor // CHECK: scf.if -// CHECK: memref.store {{.*}} : memref<16xf32, #gpu.address_space> +// CHECK: memref.store {{.*}} : memref<8xf32, #gpu.address_space> // CHECK: } -// CHECK: gpu.barrier -// CHECK: arith.minui -// CHECK: memref.load -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: gpu.shuffle xor -// CHECK: arith.addf -// CHECK: arith.addf -// CHECK: vector.broadcast %{{.*}} : f32 to vector<1xf32> +// CHECK-COUNT-3: gpu.shuffle xor +// CHECK: gpu.shuffle idx +// CHECK: arith.divf {{.*}} : vector<4xf32> // CHECK: scf.for -// CHECK: vector.transfer_read -// CHECK: arith.divf {{.*}} : vector<4x1xf32> // CHECK: vector.transfer_write {{.*}} : vector<4xf32>, memref<512x10240xf32, #hal.descriptor_type> // CHECK: } // CHECK: return @@ -300,3 +239,340 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { // CHECK: vector.transfer_write // CHECK: } // CHECK: return + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @small_reduction { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @small_reduction ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @small_reduction() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 13], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x13xf32> + %3 = tensor.empty() : tensor<1024xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1024xf32>) -> tensor<1024xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<1024x13xf32>) outs(%4 : tensor<1024xf32>) { + ^bb0(%in: f32, 
%out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<1024xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [1024], strides = [1] : tensor<1024xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// Small reduction computes the whole reduction on a single thread. +// CHECK-LABEL: func.func @small_reduction +// CHECK: scf.for %{{.*}} = %c0 to %c13 step %c4 +// CHECK: linalg.generic +// CHECK: arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_reduction { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_reduction ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> + %3 = tensor.empty() : tensor<8xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<8x64xf32>) outs(%4 : tensor<8xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<8xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_reduction +// CHECK: %[[RD:.+]] = vector.transfer_read {{.*}} memref<8x64xf32, #hal.descriptor_type>, vector<2xf32> +// CHECK: %[[ADD:.+]] = arith.addf %[[RD]] +// CHECK: vector.reduction , %[[ADD]] +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf +// CHECK: scf.if +// CHECK: vector.transfer_write {{.*}} memref<8xf32, #hal.descriptor_type> + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_elementwise_reduction_elementwise { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_elementwise_reduction_elementwise ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_elementwise_reduction_elementwise() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> + %3 = tensor.empty() : tensor<8xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> + %5 
= linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<8x64xf32>) outs(%4 : tensor<8xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = arith.addf %in, %in : f32 + %8 = arith.addf %7, %7 : f32 + %9 = arith.addf %8, %out : f32 + linalg.yield %9 : f32 + } -> tensor<8xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<8xf32>) outs(%3 : tensor<8xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.sqrt %in : f32 + linalg.yield %7 : f32 + } -> tensor<8xf32> + flow.dispatch.tensor.store %6, %1, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_elementwise_reduction_elementwise +// CHECK: vector.transfer_read {{.*}} vector<2xf32> +// CHECK: arith.addf{{.*}} : vector<2xf32> +// CHECK: arith.addf{{.*}} : vector<2xf32> +// CHECK: arith.addf{{.*}} : vector<2xf32> +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf +// CHECK: %[[SQRT_VEC:.+]] = math.sqrt +// CHECK: scf.if +// CHECK: vector.transfer_write %[[SQRT_VEC]], {{.*}} : vector<1xf32>, memref<8xf32, #hal.descriptor_type> + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_reduction_larger { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_reduction_larger ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction_larger() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<33x1024xf32> + %3 = tensor.empty() : tensor<33xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<33xf32>) -> tensor<33xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<33x1024xf32>) outs(%4 : tensor<33xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<33xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [33], strides = [1] : tensor<33xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_reduction_larger +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf +// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8xf32, #gpu.address_space> +// CHECK: scf.if +// CHECK: memref.store %{{.*}}, %[[ALLOC]][%{{.*}}] : memref<8xf32, #gpu.address_space> +// CHECK: } +// CHECK: arith.minui +// CHECK: memref.load +// CHECK-COUNT-3: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf +// CHECK: %[[RES:.*]], %{{.*}} = gpu.shuffle idx +// CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<1xf32> +// CHECK: scf.if +// CHECK: vector.transfer_write %[[RES_VEC]] + +// ----- + +#pipeline_layout = 
#hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_reduction_1d { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction_1d() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> + %3 = tensor.empty() : tensor + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor + flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_reduction_1d +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_elementwise_reduction_elementwise_4d { +hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @group_elementwise_reduction_elementwise_4d ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_elementwise_reduction_elementwise_4d() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 8, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x4x8x64xf32> + %3 = tensor.empty() : tensor<2x4x8xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x4x8xf32>) -> tensor<2x4x8xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], + iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2 : tensor<2x4x8x64xf32>) outs(%4 : tensor<2x4x8xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = arith.addf %in, %in : f32 + %8 = arith.addf %7, %7 : f32 + %9 = arith.addf %8, %out : f32 + linalg.yield %9 : f32 + } -> tensor<2x4x8xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], + iterator_types = ["parallel", "parallel", "parallel"]} ins(%5 : tensor<2x4x8xf32>) outs(%3 : tensor<2x4x8xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.sqrt %in : f32 + linalg.yield %7 : f32 + 
} -> tensor<2x4x8xf32> + flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0], sizes = [2, 4, 8], strides = [1, 1, 1] : tensor<2x4x8xf32> -> !flow.dispatch.tensor> + return + } + } +} +} + +// CHECK-LABEL: func.func @group_elementwise_reduction_elementwise_4d +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +hal.executable private @i4_dequant_matvec { + hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { + hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @i4_dequant_matvec() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %9 = tensor.empty() : tensor<4096xf16> + %10 = tensor.empty() : tensor<4096x32x128xf16> + %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> + %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { + ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f16 + %16 = arith.subf %15, %in_1 : f16 + %17 = arith.mulf %16, %in_0 : f16 + linalg.yield %17 : f16 + } -> tensor<4096x32x128xf16> + %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %14 = arith.mulf %in, %in_0 : f16 + %15 = arith.addf %14, %out : f16 + linalg.yield %15 : f16 + } -> tensor<4096xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0], 
sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func.func @i4_dequant_matvec() +// CHECK: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x8xf16> +// CHECK: %[[FOR:.+]] = scf.for %{{.+}} = %c0 to %c32 step %c4 iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<1x8xf16>) +// CHECK: %[[READ0:.+]] = vector.transfer_read {{.+}} : memref<4096x32x128xi4, #hal.descriptor_type>, vector<1x8xi4> +// CHECK: %[[READ1:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> +// CHECK: %[[READ2:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> +// CHECK: %[[READ3:.+]] = vector.transfer_read {{.+}} : memref<32x128xf16, #hal.descriptor_type>, vector<1x8xf16> +// CHECK: %[[EXTEND:.+]] = arith.extui %[[READ0]] : vector<1x8xi4> to vector<1x8xi32> +// CHECK: %[[CVT:.+]] = arith.uitofp %[[EXTEND]] : vector<1x8xi32> to vector<1x8xf16> +// CHECK: %[[SUB:.+]] = arith.subf %[[CVT]], %[[READ1]] : vector<1x8xf16> +// CHECK: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[READ2]] : vector<1x8xf16> +// CHECK: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<1x8xf16> +// CHECK: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[ARG]] : vector<1x8xf16> + +// CHECK: %[[SCAST:.+]] = vector.shape_cast %[[FOR]] : vector<1x8xf16> to vector<8xf16> +// CHECK: vector.reduction , %[[SCAST]] : vector<8xf16> into f16 +// CHECK-COUNT-6: gpu.shuffle xor +// CHECK: scf.if +// CHECK: vector.transfer_write diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir index c46f738d3fa9..fea7846af70b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir @@ -1,84 +1,335 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s --check-prefix=CDNA3 +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 \ +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" \ +// RUN: %s | FileCheck %s +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 \ +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" \ +// RUN: %s | FileCheck %s --check-prefix=CDNA3 #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding ]> -func.func @softmax() { - %c0 = arith.constant 0 : index - %cst = arith.constant -3.40282347E+38 : f32 - %cst_0 = arith.constant 0.000000e+00 : f32 - %cst_1 = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = 
flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> - %3 = tensor.empty() : tensor<12x128x40960xf32> - %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor> - return +hal.executable @group_reduction_1d { +hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction_1d() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> + %3 = tensor.empty() : tensor + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor + flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> + return + } + } +} } -// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CHECK-LABEL: func.func @softmax -// CHECK-SAME: translation_info = #[[$TRANSLATION]] -// CHECK-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} +// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CDNA3: func.func @group_reduction_1d() +// CDNA3-SAME: translation_info = #[[$TRANSLATION]] +// CDNA3-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +hal.executable @group_reduction_1d { +hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @group_reduction_1d() { + %c0 = arith.constant 0 : index + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> + %3 = tensor.empty() : tensor + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : 
tensor<64xf32>) outs(%4 : tensor) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor + flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> + return + } + } +} +} + +// On CDNA, we prefer wave64 with subgroup size of 64. + +// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK: func.func @group_reduction_1d +// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +hal.executable private @i4_dequant_matvec { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @i4_dequant_matvec() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %9 = tensor.empty() : tensor<4096xf16> + %10 = tensor.empty() : tensor<4096x32x128xf16> + %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> + %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { + ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f16 + %16 = arith.subf %15, %in_1 : f16 + %17 = arith.mulf %16, %in_0 : f16 + linalg.yield %17 : f16 + } -> tensor<4096x32x128xf16> + %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: 
f16): + %14 = arith.mulf %in, %in_0 : f16 + %15 = arith.addf %14, %out : f16 + linalg.yield %15 : f16 + } -> tensor<4096xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CDNA3: func.func @i4_dequant_matvec() +// CDNA3-SAME: translation_info = #[[$TRANSLATION]] +// CDNA3: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x8xf16> +// CDNA3: %[[FOR:.+]] = scf.for %{{.+}} = %c0 to %c32 step %c4 iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<1x8xf16>) +// CDNA3: %[[READ0:.+]] = vector.transfer_read {{.+}} : memref<4096x32x128xi4, #hal.descriptor_type>, vector<1x8xi4> +// CDNA3: %[[READ1:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> +// CDNA3: %[[READ2:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> +// CDNA3: %[[READ3:.+]] = vector.transfer_read {{.+}} : memref<32x128xf16, #hal.descriptor_type>, vector<1x8xf16> +// CDNA3: %[[EXTEND:.+]] = arith.extui %[[READ0]] : vector<1x8xi4> to vector<1x8xi32> +// CDNA3: %[[CVT:.+]] = arith.uitofp %[[EXTEND]] : vector<1x8xi32> to vector<1x8xf16> +// CDNA3: %[[SUB:.+]] = arith.subf %[[CVT]], %[[READ1]] : vector<1x8xf16> +// CDNA3: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[READ2]] : vector<1x8xf16> +// CDNA3: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<1x8xf16> +// CDNA3: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[ARG]] : vector<1x8xf16> + +// CDNA3: %[[SCAST:.+]] = vector.shape_cast %[[FOR]] : vector<1x8xf16> to vector<8xf16> +// CDNA3: vector.reduction , %[[SCAST]] : vector<8xf16> into f16 +// CDNA3-COUNT-6: gpu.shuffle xor +// CDNA3: scf.if +// CDNA3: vector.transfer_write + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +hal.executable private @i4_dequant_matvec { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @i4_dequant_matvec() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> 
-> tensor<4096x32xf16> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %9 = tensor.empty() : tensor<4096xf16> + %10 = tensor.empty() : tensor<4096x32x128xf16> + %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> + %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { + ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f16 + %16 = arith.subf %15, %in_1 : f16 + %17 = arith.mulf %16, %in_0 : f16 + linalg.yield %17 : f16 + } -> tensor<4096x32x128xf16> + %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %14 = arith.mulf %in, %in_0 : f16 + %15 = arith.addf %14, %out : f16 + linalg.yield %15 : f16 + } -> tensor<4096xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK: func.func @i4_dequant_matvec() +// CHECK-SAME: translation_info = #[[$TRANSLATION]] // ----- #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding ]> -func.func @softmax() { - %c0 = arith.constant 0 : index - %cst = arith.constant -3.40282347E+38 : f32 - %cst_0 = arith.constant 0.000000e+00 : f32 - %cst_1 = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> - %3 = tensor.empty() : tensor<12x128x40960xf32> - %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor> - return +hal.executable private @matvec_fp16 { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matvec_fp16() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = 
hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> + %5 = tensor.empty() : tensor<1x32000xf16> + %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> + %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.mulf %in, %in_0 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield %9 : f16 + } -> tensor<1x32000xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor> + return + } + } + } } -// On CDNA, we prefer wave64 with subgroup size 64. +// This matvec is expected to be reduced multiple rows at a time by a single workgroup. +// Check that we distribute it across subgroup threads properly. Thread 0 is expected to +// write 8 results at the end. +// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed. -// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CDNA3: func.func @softmax -// CDNA3-SAME: translation_info = #[[$TRANSLATION]] -// CDNA3-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} +// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK: func.func @matvec_fp16() +// CHECK-SAME: translation_info = #[[$TRANSLATION]] +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index +// CHECK-DAG: %[[C4096:.+]] = arith.constant 4096 : index +// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x8xf16> +// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C4096]] step %[[C512]] iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<8x8xf16>) +// CHECK-DAG: %[[MAT:.+]] = vector.transfer_read {{.+}} : memref<32000x4096xf16, #hal.descriptor_type>, vector<8x8xf16> +// CHECK-DAG: %[[VEC:.+]] = vector.transfer_read {{.+}} : memref<1x4096xf16, #hal.descriptor_type>, vector<8x8xf16> +// CHECK: %[[MUL:.+]] = arith.mulf %[[VEC]], %[[MAT]] : vector<8x8xf16> +// CHECK: %[[ADD:.+]] = arith.addf %[[ARG]], %[[MUL]] : vector<8x8xf16> + +// CHECK: vector.reduction , %{{.+}} : vector<8xf16> into f16 +// CHECK-COUNT-24: gpu.shuffle xor +// CHECK: scf.if +// CHECK: vector.transfer_write {{.+}} : vector<8xf16>, memref<1x32000xf16, #hal.descriptor_type> // ----- -#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding ]> -func.func @dynamic_softmax() { - %c32_i64 = arith.constant 32 : i64 - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 - %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 - %2 = arith.extui %0 : i32 to i64 - %3 = arith.extui %1 : i32 to i64 - %4 = arith.shli %3, %c32_i64 : i64 - %5 = arith.ori %2, %4 : i64 - %6 = arith.index_castui %5 : i64 to index - %7 = flow.dispatch.workload.ordinal %6, 0 : index - %8 = 
hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%7} - %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%7} - %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : !flow.dispatch.tensor>{%7} -> tensor<32x?xf16> - %11 = tensor.empty(%7) : tensor<32x?xf16> - %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16> - flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor>{%7} - return +hal.executable private @matvec_fp16 { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matvec_fp16() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> + %5 = tensor.empty() : tensor<1x32000xf16> + %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> + %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.mulf %in, %in_0 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield %9 : f16 + } -> tensor<1x32000xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor> + return + } + } + } } +// Multi-row matvec with wave32. +// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed. 
+ +// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CDNA3: func.func @matvec_fp16() +// CDNA3-SAME: translation_info = #[[$TRANSLATION]] +// CDNA3-DAG: %[[C0:.+]] = arith.constant 0 : index +// CDNA3-DAG: %[[C512:.+]] = arith.constant 512 : index +// CDNA3-DAG: %[[C4096:.+]] = arith.constant 4096 : index +// CDNA3-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x8xf16> +// CDNA3: scf.for %{{.+}} = %[[C0]] to %[[C4096]] step %[[C512]] iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<8x8xf16>) +// CDNA3-DAG: %[[MAT:.+]] = vector.transfer_read {{.+}} : memref<32000x4096xf16, #hal.descriptor_type>, vector<8x8xf16> +// CDNA3-DAG: %[[VEC:.+]] = vector.transfer_read {{.+}} : memref<1x4096xf16, #hal.descriptor_type>, vector<8x8xf16> +// CDNA3: %[[MUL:.+]] = arith.mulf %[[VEC]], %[[MAT]] : vector<8x8xf16> +// CDNA3: %[[ADD:.+]] = arith.addf %[[ARG]], %[[MUL]] : vector<8x8xf16> -// Finer details of this lowering are captured by the spirv pipeline test. Just -// verify that warp reduction triggers. -// CHECK-LABEL: func.func @dynamic_softmax -// CHECK-COUNT-10: gpu.shuffle xor {{.*}} : i32 +// CDNA3: vector.reduction , %{{.+}} : vector<8xf16> into f16 +// CDNA3-COUNT-24: gpu.shuffle xor +// CDNA3: scf.if +// CDNA3: vector.transfer_write {{.+}} : vector<8xf16>, memref<1x32000xf16, #hal.descriptor_type> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir new file mode 100644 index 000000000000..c46f738d3fa9 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir @@ -0,0 +1,84 @@ +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s --check-prefix=CDNA3 + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @softmax() { + %c0 = arith.constant 0 : index + %cst = arith.constant -3.40282347E+38 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> + %3 = tensor.empty() : tensor<12x128x40960xf32> + %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor> + return +} + +// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK-LABEL: func.func @softmax +// CHECK-SAME: translation_info = #[[$TRANSLATION]] +// CHECK-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> 
+func.func @softmax() { + %c0 = arith.constant 0 : index + %cst = arith.constant -3.40282347E+38 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> + %3 = tensor.empty() : tensor<12x128x40960xf32> + %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor> + return +} + +// On CDNA, we prefer wave64 with subgroup size 64. + +// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CDNA3: func.func @softmax +// CDNA3-SAME: translation_info = #[[$TRANSLATION]] +// CDNA3-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @dynamic_softmax() { + %c32_i64 = arith.constant 32 : i64 + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.extui %0 : i32 to i64 + %3 = arith.extui %1 : i32 to i64 + %4 = arith.shli %3, %c32_i64 : i64 + %5 = arith.ori %2, %4 : i64 + %6 = arith.index_castui %5 : i64 to index + %7 = flow.dispatch.workload.ordinal %6, 0 : index + %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%7} + %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%7} + %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : !flow.dispatch.tensor>{%7} -> tensor<32x?xf16> + %11 = tensor.empty(%7) : tensor<32x?xf16> + %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16> + flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor>{%7} + return +} + + +// Finer details of this lowering are captured by the spirv pipeline test. Just +// verify that warp reduction triggers. 
+// CHECK-LABEL: func.func @dynamic_softmax +// CHECK-COUNT-10: gpu.shuffle xor {{.*}} : i32 diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir deleted file mode 100644 index e3b16eb22cbc..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir +++ /dev/null @@ -1,565 +0,0 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, iree-codegen-lower-executable-using-transform-dialect, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @small_reduction { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @small_reduction ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @small_reduction() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 13], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x13xf32> - %3 = tensor.empty() : tensor<1024xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1024xf32>) -> tensor<1024xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<1024x13xf32>) outs(%4 : tensor<1024xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<1024xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [1024], strides = [1] : tensor<1024xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// Small reduction computes the whole reduction on a single thread. 
-// CHECK-LABEL: func.func @small_reduction -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index -// CHECK-NOT: memref.alloc() -// CHECK: gpu.thread_id x -// CHECK: %[[v:.*]] = scf.for %{{.*}} = %[[C0]] to %[[C12]] step %[[C4]] {{.*}} -> (vector<1xf32>) { -// CHECK: vector.transfer_read {{.*}}: memref<1024x13xf32, #hal.descriptor_type>, vector<1x4xf32> -// CHECK: vector.multi_reduction , %{{.*}} : vector<1x4xf32> to vector<1xf32> -// CHECK: } -// CHECK-NOT: gpu.barrier -// CHECK: %[[r:.*]] = vector.transfer_read {{.*}}: memref<1024x13xf32, #hal.descriptor_type>, vector<1x1xf32> -// CHECK: %[[r1:.*]] = vector.shape_cast %[[r:.*]] : vector<1x1xf32> to vector<1xf32> -// CHECK: arith.addf %[[v]], %[[r1]] : vector<1xf32> - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> - %3 = tensor.empty() : tensor<8xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<8x64xf32>) outs(%4 : tensor<8xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<8xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_reduction -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK-DAG: %[[TIDX:.+]] = gpu.thread_id x - -// No allocation created for the per thread data. -// CHECK-NOT: memref.alloc() - -// Fusion occurred, no barrier before the loop -// CHECK-NOT: gpu.barrier -// Local per-thread scf.for-based reduction. -// CHECK: %[[v:.*]] = scf.for {{.*}} -> (vector<1xf32>) { -// CHECK: vector.transfer_read {{.*}} memref<8x64xf32, #hal.descriptor_type>, vector<1xf32> -// CHECK: arith.addf {{.*}} : vector<1xf32> -// No barrier within the loop. -// CHECK-NOT: gpu.barrier -// CHECK: } -// No store after the loop, the data are kept in register. -// CHECK-NOT: vector.transfer_write -// Barrier after the loop. 
-// CHECK: gpu.barrier - -// CHECK-NOT: vector.transfer_read -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// CHECK: %[[RES:.*]] = arith.addf %{{.*}} : f32 -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %{{.*}} : f32 to vector<1xf32> -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[RES_VEC]] -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_elementwise_reduction_elementwise { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_elementwise_reduction_elementwise ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_elementwise_reduction_elementwise() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> - %3 = tensor.empty() : tensor<8xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<8x64xf32>) outs(%4 : tensor<8xf32>) { - ^bb0(%in: f32, %out: f32): - %7 = arith.addf %in, %in : f32 - %8 = arith.addf %7, %7 : f32 - %9 = arith.addf %8, %out : f32 - linalg.yield %9 : f32 - } -> tensor<8xf32> - %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<8xf32>) outs(%3 : tensor<8xf32>) { - ^bb0(%in: f32, %out: f32): - %7 = math.sqrt %in : f32 - linalg.yield %7 : f32 - } -> tensor<8xf32> - flow.dispatch.tensor.store %6, %1, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_elementwise_reduction_elementwise -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK-NOT: memref.alloc() - -// Fusion occurred, no barrier before the loop -// CHECK-NOT: gpu.barrier -// Local per-thread scf.for-based reduction. 
-// CHECK: %[[TIDX:.+]] = gpu.thread_id x -// CHECK: %[[v:.*]] = scf.for {{.*}} -> (vector<1xf32>) -// CHECK: vector.transfer_read {{.*}} vector<1xf32> -// CHECK: arith.addf{{.*}} : vector<1xf32> -// CHECK: arith.addf{{.*}} : vector<1xf32> -// CHECK: arith.addf{{.*}} : vector<1xf32> -// No barrier within the loop -// CHECK-NOT: gpu.barrier -// CHECK: } -// CHECK-NOT: vector.transfer_write -// Barrier after the loop -// CHECK: gpu.barrier - -// CHECK-NOT: vector.transfer_read -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// CHECK: %[[PARTIAL:.*]] = arith.addf %{{.*}} -// CHECK: %[[BROADCAST:.*]], %{{.*}} = gpu.shuffle idx %[[PARTIAL]] -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[BROADCAST]] : f32 to vector<1xf32> -// CHECK: %[[SQRT_VEC:.*]] = math.sqrt %[[RES_VEC]] : vector<1xf32> -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[SQRT_VEC]], {{.*}} : vector<1xf32>, memref<8xf32, #hal.descriptor_type> -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_larger { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction_larger ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_larger() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<33x1024xf32> - %3 = tensor.empty() : tensor<33xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<33xf32>) -> tensor<33xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<33x1024xf32>) outs(%4 : tensor<33xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<33xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [33], strides = [1] : tensor<33xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_reduction_larger -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK-NOT: memref.alloc() - -// Fusion occurred, no barrier before the loop -// CHECK-NOT: gpu.barrier -// Local per-thread scf.for-based reduction. 
-// CHECK: %[[TIDX:.+]] = gpu.thread_id x -// CHECK: %[[TIDX_TIMES_4:.]] = affine.apply{{.*}}[%[[TIDX]]] -// CHECK: scf.for {{.*}} -> (vector<1xf32>) { -// CHECK: vector.transfer_read {{.*}} vector<4xf32> -// CHECK: vector.reduction {{.*}} : vector<4xf32> into f32 -// CHECK: vector.broadcast {{.*}} : f32 to vector<1xf32> -// No barrier within the loop -// CHECK-NOT: gpu.barrier -// CHECK: } -// CHECK-NOT: vector.transfer_write -// CHECK-NOT: vector.transfer_read -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf -// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8xf32, #gpu.address_space> -// CHECK: scf.if -// CHECK: memref.store %{{.*}}, %[[ALLOC]][%{{.*}}] : memref<8xf32, #gpu.address_space> -// CHECK: } -// CHECK: arith.minui -// CHECK: memref.load -// CHECK-COUNT-3: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf -// CHECK: %[[RES:.*]], %{{.*}} = gpu.shuffle idx -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<1xf32> -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[RES_VEC]] -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_1d { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_1d() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> - %3 = tensor.empty() : tensor - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor - %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor - flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_reduction_1d -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_elementwise_reduction_elementwise_4d { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_elementwise_reduction_elementwise_4d ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_elementwise_reduction_elementwise_4d() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 8, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x4x8x64xf32> - %3 = tensor.empty() : tensor<2x4x8xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x4x8xf32>) -> tensor<2x4x8xf32> - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2 : tensor<2x4x8x64xf32>) outs(%4 : tensor<2x4x8xf32>) { - ^bb0(%in: f32, %out: f32): - %7 = arith.addf %in, %in : f32 - %8 = arith.addf %7, %7 : f32 - %9 = arith.addf %8, %out : f32 - linalg.yield %9 : f32 - } -> tensor<2x4x8xf32> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} ins(%5 : tensor<2x4x8xf32>) outs(%3 : tensor<2x4x8xf32>) { - ^bb0(%in: f32, %out: f32): - %7 = math.sqrt %in : f32 - linalg.yield %7 : f32 - } -> tensor<2x4x8xf32> - flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0], sizes = [2, 4, 8], strides = [1, 1, 1] : tensor<2x4x8xf32> -> !flow.dispatch.tensor> - return - } - } -} -} - -// CHECK-LABEL: func.func @group_elementwise_reduction_elementwise_4d -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_i8_12345 { -hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction_i8_12345 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_i8_12345() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0 : i8 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 12345], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x12345xi8> - %3 = tensor.empty() : tensor<8x12345xi8> - %4 = tensor.empty() : tensor<8xi8> - %5 = linalg.fill ins(%cst : i8) outs(%4 : tensor<8xi8>) -> tensor<8xi8> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%2 : tensor<8x12345xi8>) - outs(%5 : tensor<8xi8>) { - ^bb0(%in: i8, %out: i8): - %6 = arith.addi %in, %out : i8 - linalg.yield %6 : i8 - } -> tensor<8xi8> - %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%2, %6 : tensor<8x12345xi8>, tensor<8xi8>) - outs(%3 : tensor<8x12345xi8>) { - ^bb0(%in: i8, %in_0: i8, %out: i8): - %8 = arith.divui %in, %in_0 : i8 - linalg.yield %8 : i8 - } -> tensor<8x12345xi8> - flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [8, 12345], strides = [1, 1] : tensor<8x12345xi8> -> !flow.dispatch.tensor> - return - } - } -} -} - - -// 
CHECK-LABEL: func.func @group_reduction_i8_12345 -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index - -// CHECK-DAG: %[[ALLOC0:.+]] = memref.alloc() {alignment = 64 : i64} : memref<1xi8, #gpu.address_space> -// Local per-thread scf.for-based reduction. -// CHECK: %[[TIDX:.+]] = gpu.thread_id x -// CHECK: scf.for {{.*}} -> (vector<1xi8>) -// CHECK: vector.transfer_read {{.*}} vector<1xi8> -// CHECK: arith.addi{{.*}} : vector<1xi8> -// CHECK-NOT: vector.transfer_write -// No barrier within the loop -// CHECK-NOT: gpu.barrier -// CHECK: } -// CHECK-NOT: vector.transfer_write -// Barrier after the loop -// CHECK: gpu.barrier - -// CHECK-NOT: vector.transfer_read -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}arith.trunci{{.*}}{{[[:space:]].*}}arith.addi{{.*}}i8 -// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32xi8, #gpu.address_space> -// CHECK: scf.if -// CHECK: memref.store %{{.*}}, %[[ALLOC]][%{{.*}}] : memref<32xi8, #gpu.address_space> -// CHECK: } -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}arith.trunci{{.*}}{{[[:space:]].*}}arith.addi{{.*}}i8 - -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %{{.+}} : i8 to vector<1xi8> -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[RES_VEC]], %[[ALLOC0]][%[[C0]]] {in_bounds = [true]} : vector<1xi8>, memref<1xi8, #gpu.address_space> - -// CHECK: gpu.barrier -// CHECK: arith.divui {{.*}} vector<8xi8> -// CHECK: arith.divui {{.*}} i8 -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d0)> -hal.executable @reduction_2d_trailing_elementwise_static_dispatch_0 { - hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { - hal.executable.export public @reduction_2d_trailing_elementwise_static_dispatch_0_generic_128x10_f32 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @reduction_2d_trailing_elementwise_static_dispatch_0_generic_128x10_f32() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 10], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x10xf32> - %3 = tensor.empty() : tensor<128x10xf32> - %4 = tensor.empty() : tensor<128xf32> - %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<128xf32>) -> tensor<128xf32> - %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<128x10xf32>) outs(%5 : tensor<128xf32>) { - ^bb0(%in: f32, %out: f32): - %8 = arith.addf %in, %out : f32 - linalg.yield %8 : f32 - } -> tensor<128xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<128x10xf32>, tensor<128xf32>) outs(%3 : tensor<128x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.divf 
%in, %in_0 : f32 - linalg.yield %8 : f32 - } -> tensor<128x10xf32> - flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [128, 10], strides = [1, 1] : tensor<128x10xf32> -> !flow.dispatch.tensor> - return - } - } - } -} - -// CHECK-LABEL: func.func @reduction_2d_trailing_elementwise_static_dispatch_0 -// CHECK-NOT: gpu.shuffle -// -// Loop vector<4> + tail vector<2> reduction part run sequentially. -// CHECK: scf.for {{.*}} -> (vector<1xf32>) { -// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<128x10xf32, #hal.descriptor_type>, vector<1x4xf32> -// CHECK: vector.multi_reduction , {{.*}} [1] : vector<1x4xf32> to vector<1xf32> -// CHECK: scf.yield %{{.*}} : vector<1xf32> -// CHECK: } -// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<128x10xf32, #hal.descriptor_type>, vector<1x2xf32> -// CHECK: vector.multi_reduction , {{.*}} [1] : vector<1x2xf32> to vector<1xf32> -// CHECK: vector.broadcast {{.*}} : vector<1xf32> to vector<1x4xf32> -// -// Loop vector<4> + tail vector<2> writeback part run sequentially. -// CHECK: scf.for {{.*}} { -// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<128x10xf32, #hal.descriptor_type>, vector<1x4xf32> -// CHECK: arith.divf {{.*}} : vector<1x4xf32> -// CHECK: vector.transfer_write {{.*}} {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x8xf32, strided<[10, 1], offset: ?>, #hal.descriptor_type> -// CHECK: } -// CHECK: vector.broadcast {{.*}} : vector<1xf32> to vector<1x2xf32> -// CHECK: arith.divf {{.*}} : vector<1x2xf32> -// CHECK: vector.transfer_write {{.*}} {in_bounds = [true, true]} : vector<1x2xf32>, memref<1x10xf32, strided<[10, 1], offset: ?>, #hal.descriptor_type> -// CHECK: gpu.barrier - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @i4_dequant_matvec { - hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @i4_dequant_matvec() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes 
= [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %9 = tensor.empty() : tensor<4096xf16> - %10 = tensor.empty() : tensor<4096x32x128xf16> - %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> - %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { - ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f16 - %16 = arith.subf %15, %in_1 : f16 - %17 = arith.mulf %16, %in_0 : f16 - linalg.yield %17 : f16 - } -> tensor<4096x32x128xf16> - %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %14 = arith.mulf %in, %in_0 : f16 - %15 = arith.addf %14, %out : f16 - linalg.yield %15 : f16 - } -> tensor<4096xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> - return - } - } - } -} - -// CHECK-LABEL: func.func @i4_dequant_matvec() -// CHECK: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x8xf16> -// CHECK: %[[FOR:.+]] = scf.for %{{.+}} = %c0 to %c32 step %c4 iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<1x8xf16>) -// CHECK: %[[READ0:.+]] = vector.transfer_read {{.+}} : memref<4096x32x128xi4, #hal.descriptor_type>, vector<1x8xi4> -// CHECK: %[[READ1:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> -// CHECK: %[[READ2:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> -// CHECK: %[[READ3:.+]] = vector.transfer_read {{.+}} : memref<32x128xf16, #hal.descriptor_type>, vector<1x8xf16> -// CHECK: %[[EXTEND:.+]] = arith.extui %[[READ0]] : vector<1x8xi4> to vector<1x8xi32> -// CHECK: %[[CVT:.+]] = arith.uitofp %[[EXTEND]] : vector<1x8xi32> to vector<1x8xf16> -// CHECK: %[[SUB:.+]] = arith.subf %[[CVT]], %[[READ1]] : vector<1x8xf16> -// CHECK: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[READ2]] : vector<1x8xf16> -// CHECK: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<1x8xf16> -// CHECK: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[ARG]] : vector<1x8xf16> - -// CHECK: %[[SCAST:.+]] = vector.shape_cast %[[FOR]] : vector<1x8xf16> to vector<8xf16> -// CHECK: vector.reduction , %[[SCAST]] : vector<8xf16> into f16 -// CHECK-COUNT-6: gpu.shuffle xor -// CHECK: scf.if -// CHECK: vector.transfer_write diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir deleted file mode 100644 index fea7846af70b..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir +++ /dev/null @@ -1,335 +0,0 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 \ -// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, 
func.func(iree-llvmgpu-lower-executable-target)))))" \ -// RUN: %s | FileCheck %s -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 \ -// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" \ -// RUN: %s | FileCheck %s --check-prefix=CDNA3 - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_1d { -hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_1d() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> - %3 = tensor.empty() : tensor - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor - %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor - flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> - return - } - } -} -} - -// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CDNA3: func.func @group_reduction_1d() -// CDNA3-SAME: translation_info = #[[$TRANSLATION]] -// CDNA3-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -hal.executable @group_reduction_1d { -hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @group_reduction_1d() { - %c0 = arith.constant 0 : index - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> - %3 = tensor.empty() : tensor - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor - %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<64xf32>) outs(%4 : tensor) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor - flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> - return - } - } -} 
-} - -// On CDNA, we prefer wave64 with subgroup size of 64. - -// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CHECK: func.func @group_reduction_1d -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @i4_dequant_matvec { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @i4_dequant_matvec() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %9 = tensor.empty() : tensor<4096xf16> - %10 = tensor.empty() : tensor<4096x32x128xf16> - %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> - %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { - ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f16 - %16 = arith.subf %15, %in_1 : f16 - %17 = arith.mulf %16, %in_0 : f16 - linalg.yield %17 : f16 - } -> tensor<4096x32x128xf16> - %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %14 = arith.mulf %in, %in_0 : f16 - %15 = arith.addf %14, %out : f16 - linalg.yield %15 : f16 - } -> tensor<4096xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> - return - } - } 
- } -} - -// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CDNA3: func.func @i4_dequant_matvec() -// CDNA3-SAME: translation_info = #[[$TRANSLATION]] -// CDNA3: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x8xf16> -// CDNA3: %[[FOR:.+]] = scf.for %{{.+}} = %c0 to %c32 step %c4 iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<1x8xf16>) -// CDNA3: %[[READ0:.+]] = vector.transfer_read {{.+}} : memref<4096x32x128xi4, #hal.descriptor_type>, vector<1x8xi4> -// CDNA3: %[[READ1:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> -// CDNA3: %[[READ2:.+]] = vector.transfer_read {{.+}} : memref<4096x32xf16, #hal.descriptor_type>, vector<1x8xf16> -// CDNA3: %[[READ3:.+]] = vector.transfer_read {{.+}} : memref<32x128xf16, #hal.descriptor_type>, vector<1x8xf16> -// CDNA3: %[[EXTEND:.+]] = arith.extui %[[READ0]] : vector<1x8xi4> to vector<1x8xi32> -// CDNA3: %[[CVT:.+]] = arith.uitofp %[[EXTEND]] : vector<1x8xi32> to vector<1x8xf16> -// CDNA3: %[[SUB:.+]] = arith.subf %[[CVT]], %[[READ1]] : vector<1x8xf16> -// CDNA3: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[READ2]] : vector<1x8xf16> -// CDNA3: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<1x8xf16> -// CDNA3: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[ARG]] : vector<1x8xf16> - -// CDNA3: %[[SCAST:.+]] = vector.shape_cast %[[FOR]] : vector<1x8xf16> to vector<8xf16> -// CDNA3: vector.reduction , %[[SCAST]] : vector<8xf16> into f16 -// CDNA3-COUNT-6: gpu.shuffle xor -// CDNA3: scf.if -// CDNA3: vector.transfer_write - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @i4_dequant_matvec { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @i4_dequant_matvec() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %9 = tensor.empty() : tensor<4096xf16> - %10 = tensor.empty() : tensor<4096x32x128xf16> - %11 = linalg.fill 
ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> - %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { - ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f16 - %16 = arith.subf %15, %in_1 : f16 - %17 = arith.mulf %16, %in_0 : f16 - linalg.yield %17 : f16 - } -> tensor<4096x32x128xf16> - %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %14 = arith.mulf %in, %in_0 : f16 - %15 = arith.addf %14, %out : f16 - linalg.yield %15 : f16 - } -> tensor<4096xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> - return - } - } - } -} - -// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CHECK: func.func @i4_dequant_matvec() -// CHECK-SAME: translation_info = #[[$TRANSLATION]] - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @matvec_fp16 { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @matvec_fp16() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> - %5 = tensor.empty() : tensor<1x32000xf16> - %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> - %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.mulf %in, %in_0 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } -> tensor<1x32000xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> 
!flow.dispatch.tensor> - return - } - } - } -} - -// This matvec is expected to be reduced multiple rows at a time by a single workgroup. -// Check that we distribute it across subgroup threads properly. Thread 0 is expected to -// write 8 results at the end. -// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed. - -// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CHECK: func.func @matvec_fp16() -// CHECK-SAME: translation_info = #[[$TRANSLATION]] -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index -// CHECK-DAG: %[[C4096:.+]] = arith.constant 4096 : index -// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x8xf16> -// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C4096]] step %[[C512]] iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<8x8xf16>) -// CHECK-DAG: %[[MAT:.+]] = vector.transfer_read {{.+}} : memref<32000x4096xf16, #hal.descriptor_type>, vector<8x8xf16> -// CHECK-DAG: %[[VEC:.+]] = vector.transfer_read {{.+}} : memref<1x4096xf16, #hal.descriptor_type>, vector<8x8xf16> -// CHECK: %[[MUL:.+]] = arith.mulf %[[VEC]], %[[MAT]] : vector<8x8xf16> -// CHECK: %[[ADD:.+]] = arith.addf %[[ARG]], %[[MUL]] : vector<8x8xf16> - -// CHECK: vector.reduction , %{{.+}} : vector<8xf16> into f16 -// CHECK-COUNT-24: gpu.shuffle xor -// CHECK: scf.if -// CHECK: vector.transfer_write {{.+}} : vector<8xf16>, memref<1x32000xf16, #hal.descriptor_type> - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -hal.executable private @matvec_fp16 { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @matvec_fp16() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> - %5 = tensor.empty() : tensor<1x32000xf16> - %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> - %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.mulf %in, %in_0 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } -> tensor<1x32000xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> 
!flow.dispatch.tensor> - return - } - } - } -} - -// Multi-row matvec with wave32. -// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed. - -// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// CDNA3: func.func @matvec_fp16() -// CDNA3-SAME: translation_info = #[[$TRANSLATION]] -// CDNA3-DAG: %[[C0:.+]] = arith.constant 0 : index -// CDNA3-DAG: %[[C512:.+]] = arith.constant 512 : index -// CDNA3-DAG: %[[C4096:.+]] = arith.constant 4096 : index -// CDNA3-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<8x8xf16> -// CDNA3: scf.for %{{.+}} = %[[C0]] to %[[C4096]] step %[[C512]] iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<8x8xf16>) -// CDNA3-DAG: %[[MAT:.+]] = vector.transfer_read {{.+}} : memref<32000x4096xf16, #hal.descriptor_type>, vector<8x8xf16> -// CDNA3-DAG: %[[VEC:.+]] = vector.transfer_read {{.+}} : memref<1x4096xf16, #hal.descriptor_type>, vector<8x8xf16> -// CDNA3: %[[MUL:.+]] = arith.mulf %[[VEC]], %[[MAT]] : vector<8x8xf16> -// CDNA3: %[[ADD:.+]] = arith.addf %[[ARG]], %[[MUL]] : vector<8x8xf16> - -// CDNA3: vector.reduction , %{{.+}} : vector<8xf16> into f16 -// CDNA3-COUNT-24: gpu.shuffle xor -// CDNA3: scf.if -// CDNA3: vector.transfer_write {{.+}} : vector<8xf16>, memref<1x32000xf16, #hal.descriptor_type> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir deleted file mode 100644 index 33d5f29e93be..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir +++ /dev/null @@ -1,191 +0,0 @@ -// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-jit=1 --iree-codegen-llvmgpu-enable-transform-dialect-batch-matmul-strategy |\ -// RUN: FileCheck %s --check-prefixes=CHECK,DEFAULT - -// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-jit=1 --iree-codegen-llvmgpu-enable-transform-dialect-batch-matmul-strategy \ -// RUN: -td-matmul-strategy-blk-sizes=128,64,32,2 \ -// RUN: -td-matmul-strategy-reduc-size=8 \ -// RUN: -td-matmul-strategy-num-threads=32,4,1 \ -// RUN: -td-matmul-strategy-num-warps=1,4,1 \ -// RUN: -td-matmul-strategy-use-async-copies=true \ -// RUN: -td-matmul-strategy-pipeline-depth=3 \ -// RUN: -td-matmul-strategy-use-mma-sync=false \ -// RUN: -td-matmul-strategy-use-fma=true \ -// RUN: | FileCheck %s --check-prefixes=CHECK,OPTIONS - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> -#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> -func.func @batch_matmul_dispatch_0_generic_128x80x320x32_f32() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = 
flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x80x32xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x320xf32> - %5 = tensor.empty() : tensor<128x80x320xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x80x320xf32>) -> tensor<128x80x320xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x80x32xf32>, tensor<128x32x320xf32>) outs(%6 : tensor<128x80x320xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.mulf %in, %in_0 : f32 - %9 = arith.addf %out, %8 : f32 - linalg.yield %9 : f32 - } -> tensor<128x80x320xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : tensor<128x80x320xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK: transform.named_sequence -// CHECK: transform.iree.register_match_callbacks -// CHECK: %[[MATCH:.+]]:2 = transform.iree.match_callback failures(propagate) "batch_matmul" -// CHECK: %[[TILED:.+]], %[[FORALL:.+]] = transform.structured.tile_using_forall %[[MATCH]]#1 -// DEFAULT: tile_sizes [64, 64, 1](mapping = [#gpu.block, #gpu.block, #gpu.block]) -// OPTIONS: tile_sizes [128, 64, 32](mapping = [#gpu.block, #gpu.block, #gpu.block]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: %[[FUSED:.+]], %[[CONTAINING:.+]] = transform.structured.fuse_into_containing_op %[[MATCH]]#0 into %[[FORALL]] -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice %[[FORALL]] -// CHECK: %[[TILED_LINALG:.+]], %[[LOOPS:.+]] = transform.structured.tile_using_for %tiled_op -// DEFAULT: [0, 0, 0, 16] -// OPTIONS: [0, 0, 0, 8] -// CHECK: %[[PADDED:.+]], %{{.*}}, %{{.+}} = transform.structured.pad %tiled_linalg_op pad_to_multiple_of [1, 1, 1, 1] -// CHECK: nofold_flags = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3] -// CHECK: padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// CHECK: %[[V3:.+]] = transform.get_producer_of_operand %[[PADDED]][2] -// CHECK: transform.structured.hoist_pad %{{.*}} by 1 loops -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: %[[FILL:.+]] = transform.structured.match ops{["linalg.fill"]} -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.structured.match ops{["tensor.parallel_insert_slice"]} -// CHECK: transform.structured.insert_slice_to_copy -// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %[[PADDED]][0] -// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %[[PADDED]][1] -// CHECK: %[[RHS_DPS:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS]] - -// CHECK: transform.structured.tile_using_forall %[[LHS]] -// DEFAULT: num_threads [1, 32, 4](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// OPTIONS: num_threads [1, 64, 2](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.structured.match ops{["scf.if"]} -// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch - -// CHECK: transform.structured.tile_using_forall %[[RHS_DPS]] -// DEFAULT: num_threads [8, 16, 1](mapping = [#gpu.thread, #gpu.thread, 
#gpu.thread]) -// OPTIONS: num_threads [2, 8, 8](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// CHECK: transform.structured.tile_using_forall -// DEFAULT: num_threads [2, 64, 1](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// OPTIONS: num_threads [1, 16, 8](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// CHECK: transform.structured.tile_using_forall -// DEFAULT: num_threads [1, 2, 64](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// OPTIONS: num_threads [1, 4, 32](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// CHECK: %tiled_op_8, %forall_op_9 = transform.structured.tile_using_forall %[[FILL]] -// DEFAULT: num_threads [1, 2, 64](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// OPTIONS: num_threads [1, 4, 32](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// CHECK: transform.structured.vectorize -// DEFAULT: vector_sizes [64, 2, 4] -// OPTIONS: vector_sizes [128, 1, 4] -// CHECK: transform.structured.vectorize -// DEFAULT: vector_sizes [32, 1, 1] -// OPTIONS: vector_sizes [128, 4, 4] -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.vector.lower_masked_transfers -// CHECK: transform.structured.vectorize_children_and_apply_patterns -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.canonicalization -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.eliminate_empty_tensors - -// CHECK: transform.iree.bufferize {target_gpu} -// CHECK: transform.memref.erase_dead_alloc_and_stores -// CHECK: transform.iree.forall_to_workgroup -// CHECK: transform.iree.map_nested_forall_to_gpu_threads -// DEFAULT: workgroup_dims = [64, 2, 1] -// OPTIONS: workgroup_dims = [32, 4, 1] -// CHECK: transform.iree.eliminate_gpu_barriers -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.hoist_static_alloc -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.memref.extract_address_computations -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.linalg.tiling_canonicalization -// CHECK: transform.apply_patterns.iree.fold_fill_into_pad -// CHECK: transform.apply_patterns.scf.for_loop_canonicalization -// CHECK: transform.apply_patterns.canonicalization -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.synchronize_loop -// CHECK: transform.structured.hoist_redundant_vector_transfers -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.memref.erase_dead_alloc_and_stores -// CHECK: transform.iree.eliminate_gpu_barriers -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops - -// CHECK: transform.memref.multibuffer 
-// DEFAULT: factor = 2 -// OPTIONS: factor = 3 -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.vector.transfer_to_scf full_unroll = true -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.create_async_groups -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.pipeline_shared_memory_copies -// DEFAULT: depth = 2 -// OPTIONS: depth = 3 -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.vector.lower_masks -// CHECK: apply_patterns -// CHECK: transform.apply_patterns.vector.materialize_masks -// CHECK: apply_patterns -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir deleted file mode 100644 index 6ab8221d5351..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir +++ /dev/null @@ -1,130 +0,0 @@ -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit= --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-implicit-gemm-strategy | FileCheck %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @nchw_convolution() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 128, 258, 258], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x128x258x258xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [256, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<256x128x3x3xf32> - %5 = tensor.empty() : tensor<8x256x256x256xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x128x258x258xf32>, tensor<256x128x3x3xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @nchw_convolution - -// CHECK: transform.named_sequence -// CHECK: transform.iree.match_callback failures(propagate) "convolution" -// CHECK: transform.structured.convert_conv2d_to_img2col -// CHECK: transform.get_producer_of_operand %{{.*}}[0] -// CHECK: transform.apply_patterns.iree.bubble_collapse -// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [1, 128, 128](mapping = [#gpu.block, #gpu.block, #gpu.block]) -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice %{{.*}} -// CHECK: 
transform.structured.match ops{["linalg.fill"]} -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.structured.tile_using_for %{{.*}}[0, 0, 0, 16] -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.structured.pad %{{.*}} pad_to_multiple_of [1, 1, 1, 1] {copy_back_op = "none", nofold_flags = [1, 0, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// CHECK: transform.structured.match ops{["linalg.fill"]} -// CHECK: %[[RES:.+]] = transform.get_producer_of_operand %{{.*}}[2] -// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RES]] -// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %{{.*}}[0] -// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %{{.*}}[1] -// CHECK: transform.structured.rewrite_in_destination_passing_style %[[LHS]] -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %[[RHS]] num_threads [1, 4, 32](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface -// CHECK: transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices -// CHECK: transform.apply_patterns.vector.cast_away_vector_leading_one_dim -// CHECK: transform.structured.vectorize_children_and_apply_patterns %{{.*}} {vectorize_nd_extract} -// CHECK: transform.iree.eliminate_empty_tensors -// CHECK: transform.iree.bufferize {target_gpu} -// CHECK: transform.memref.erase_dead_alloc_and_stores -// CHECK: transform.iree.forall_to_workgroup -// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] -// CHECK: transform.iree.hoist_static_alloc %{{.*}} -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK: transform.apply_patterns.memref.extract_address_computations -// CHECK: transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync -// CHECK: transform.structured.hoist_redundant_vector_transfers -// CHECK: transform.iree.vector.vector_to_mma_conversion %{{.*}} {use_mma_sync} - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @nhwc_convolution() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x258x258x128xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 128, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x128x256xf32> - %5 = tensor.empty() : tensor<8x256x256x256xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) 
-> tensor<8x256x256x256xf32> - %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x128xf32>, tensor<3x3x128x256xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @nhwc_convolution - -// CHECK: transform.named_sequence -// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [1, 128, 128](mapping = [#gpu.block, #gpu.block, #gpu.block]) -// CHECK: transform.structured.pad %{{.*}} pad_to_multiple_of [1, 1, 1, 1] {copy_back_op = "none", nofold_flags = [0, 1, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// CHECK: %[[RES:.+]] = transform.get_producer_of_operand %{{.*}}[2] -// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RES]] -// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %{{.*}}[0] -// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %{{.*}}[1] -// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RHS]] -// CHECK: transform.structured.tile_using_forall %[[LHS]] num_threads [1, 32, 4](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @unaligned_convolution() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 132], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x258x258x132xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 132, 264], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x132x264xf32> - %5 = tensor.empty() : tensor<8x256x256x264xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32> - %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x132xf32>, tensor<3x3x132x264xf32>) outs(%6 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 264], strides = [1, 1, 1, 1] : tensor<8x256x256x264xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK: #iree_codegen.translation_info -// CHECK-LABEL: func @unaligned_convolution - -// Currently padding on the img2col op is not supported so bail out for unaligned. 
-// CHECK-NOT: transform.named_sequence diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir deleted file mode 100644 index 8943709e1c13..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir +++ /dev/null @@ -1,522 +0,0 @@ -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul | FileCheck %s - -// Check that setting the command line options affect the transform -// strategy as expected. -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: -td-matmul-strategy-blk-sizes=256,64,1 \ -// RUN: -td-matmul-strategy-reduc-size=8 \ -// RUN: -td-matmul-strategy-num-threads=32,4,1 \ -// RUN: -td-matmul-strategy-num-warps=1,4,1 \ -// RUN: -td-matmul-strategy-use-async-copies=true \ -// RUN: -td-matmul-strategy-use-mma-sync=true \ -// RUN: -td-matmul-strategy-pipeline-depth=5 \ -// RUN: | FileCheck --check-prefix=WITH_OPTIONS %s - -// Check that various more exotic strategies apply properly e2e but without otherwise checking their content. -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul \ -// RUN: -td-matmul-strategy-blk-sizes=16,16,1 \ -// RUN: -td-matmul-strategy-reduc-size=16 \ -// RUN: -td-matmul-strategy-num-threads=32,1,1 \ -// RUN: -td-matmul-strategy-num-warps=1,1,1 \ -// RUN: -td-matmul-strategy-use-async-copies=true \ -// RUN: -td-matmul-strategy-use-mma-sync=true \ -// RUN: -td-matmul-strategy-pipeline-depth=9 \ -// RUN: | FileCheck --check-prefix=WITH_OPTIONS_2 %s - -// Check that various more exotic strategies apply properly e2e but without otherwise checking their content. 
-// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul \ -// RUN: -td-matmul-strategy-blk-sizes=128,64,1 \ -// RUN: -td-matmul-strategy-reduc-size=16 \ -// RUN: -td-matmul-strategy-num-threads=128,2,1 \ -// RUN: -td-matmul-strategy-num-warps=1,8,1 \ -// RUN: -td-matmul-strategy-use-async-copies=true \ -// RUN: -td-matmul-strategy-use-mma-sync=true \ -// RUN: -td-matmul-strategy-pipeline-depth=3 \ -// RUN: | FileCheck --check-prefix=WITH_OPTIONS_3 %s - -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-small-matmul | FileCheck --check-prefix=SMALL %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_1() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf32> - %5 = tensor.empty() : tensor<2052x2052xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul_1 - -// CHECK: transform.named_sequence -// CHECK: transform.iree.match_callback failures(propagate) "matmul" -// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [128, 128](mapping = [#gpu.block, #gpu.block]) -// CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice -// CHECK: transform.structured.tile_using_for %{{.*}}[0, 0, 16] -// CHECK: transform.structured.pad %{{.*}} pad_to_multiple_of [1, 1, 1] {copy_back_op = "none", nofold_flags = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// CHECK: transform.structured.hoist_pad %{{.}} by 1 loops -// CHECK: transform.structured.insert_slice_to_copy %{{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.scf.take_assumed_branch %{{.*}} 
take_else_branch : (!transform.any_op) -> () -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [4, 4] -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [4, 4] -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [32, 4] -// CHECK: transform.apply_patterns.vector.lower_masked_transfers -// CHECK: transform.structured.vectorize_children_and_apply_patterns %{{.*}} -// CHECK: transform.iree.eliminate_empty_tensors %{{.*}} -// CHECK: transform.iree.bufferize {target_gpu} %{{.*}} -// CHECK: transform.iree.forall_to_workgroup %{{.*}} -// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] -// CHECK: transform.iree.hoist_static_alloc %{{.*}} -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK: } : !transform.any_op -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.memref.extract_address_computations -// CHECK: } : !transform.any_op -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync -// CHECK: } : !transform.any_op -// CHECK: transform.structured.match ops{["scf.for"]} in %{{.*}} -// CHECK: transform.iree.synchronize_loop %{{.*}} -// CHECK: transform.structured.hoist_redundant_vector_transfers %{{.*}} -// CHECK: transform.memref.erase_dead_alloc_and_stores %{{.*}} -// CHECK: transform.iree.vector.vector_to_mma_conversion %{{.*}} {use_mma_sync} -// CHECK: transform.iree.eliminate_gpu_barriers -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK: } : !transform.any_op -// CHECK: transform.memref.multibuffer %{{.*}} {factor = 3 : i64, skip_analysis} -// CHECK: transform.apply_patterns.vector.transfer_to_scf full_unroll = true -// CHECK: transform.iree.create_async_groups %{{.*}} {use_mma_sync} -// CHECK: transform.iree.pipeline_shared_memory_copies %{{.*}} {depth = 3 : i64, use_mma_sync} -// CHECK: transform.apply_patterns.vector.lower_masks -// CHECK: transform.apply_patterns.vector.materialize_masks -// CHECK: apply_patterns to %{{.*}} { -// CHECK-DAG: transform.apply_patterns.linalg.tiling_canonicalization -// CHECK-DAG: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK-DAG: transform.apply_patterns.canonicalization -// CHECK: } : !transform.any_op -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// WITH_OPTIONS-LABEL: func @matmul_1 - -// WITH_OPTIONS: transform.named_sequence -// WITH_OPTIONS: transform.iree.match_callback failures(propagate) "matmul" -// Tile sizes are set by td-matmul-strategy-blk-size-XX. -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} tile_sizes [256, 64](mapping = [#gpu.block, #gpu.block]) -// WITH_OPTIONS: transform.structured.fuse_into_containing_op -// WITH_OPTIONS: transform.iree.populate_workgroup_count_region_using_num_threads_slice -// The tiling is affected by td-matmul-strategy-reduc-size: 8. 
-// WITH_OPTIONS: transform.structured.tile_using_for %{{.*}}[0, 0, 8] -// WITH_OPTIONS: transform.structured.pad %{{.*}} pad_to_multiple_of [1, 1, 1] {copy_back_op = "none", nofold_flags = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} -// WITH_OPTIONS: transform.structured.hoist_pad %{{.}} by 1 loops -// WITH_OPTIONS: transform.structured.insert_slice_to_copy %{{.*}} : (!transform.any_op) -> !transform.any_op -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [64, 2](mapping = [#gpu.thread, #gpu.thread]) -// WITH_OPTIONS: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) -// WITH_OPTIONS: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [4, 1](mapping = [#gpu.warp, #gpu.warp]) -// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [4, 1](mapping = [#gpu.warp, #gpu.warp]) -// WITH_OPTIONS: transform.structured.vectorize %{{.*}} vector_sizes [4, 4] -// WITH_OPTIONS: transform.structured.vectorize %{{.*}} vector_sizes [1, 4] -// WITH_OPTIONS: transform.structured.vectorize %{{.*}} vector_sizes [32, 4] -// WITH_OPTIONS: transform.apply_patterns.vector.lower_masked_transfers -// WITH_OPTIONS: transform.structured.vectorize_children_and_apply_patterns %{{.*}} -// WITH_OPTIONS: transform.iree.eliminate_empty_tensors %{{.*}} -// WITH_OPTIONS: transform.iree.bufferize {target_gpu} %{{.*}} -// WITH_OPTIONS: transform.iree.forall_to_workgroup %{{.*}} -// The workgroup dimensions are controled by td-matmul-strategy-num-threads-XX. -// The warp dimensions are controled by td-matmul-strategy-num-warps-XX. -// WITH_OPTIONS: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [32, 4, 1] -// WITH_OPTIONS: transform.iree.hoist_static_alloc %{{.*}} -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.memref.fold_memref_alias_ops -// WITH_OPTIONS: } : !transform.any_op -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.memref.extract_address_computations -// WITH_OPTIONS: } : !transform.any_op -// The unroll attribute should match td-matmul-use-mma-sync, for true: mma_sync, -// for false:_wmma. -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync -// WITH_OPTIONS: } -// WITH_OPTIONS: transform.structured.match ops{["scf.for"]} in %{{.*}} -// WITH_OPTIONS: transform.iree.synchronize_loop %{{.*}} -// WITH_OPTIONS: transform.structured.hoist_redundant_vector_transfers %{{.*}} -// WITH_OPTIONS: transform.memref.erase_dead_alloc_and_stores %{{.*}} -// The attribute should match td-matmul-use-mma-sync. -// WITH_OPTIONS: transform.iree.vector.vector_to_mma_conversion %{{.*}} {use_mma_sync} -// WITH_OPTIONS: transform.iree.eliminate_gpu_barriers -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.memref.fold_memref_alias_ops -// WITH_OPTIONS: } : !transform.any_op -// The multibuffer pass is only run when we set use-async-copies. -// The factor should match td-matmul-strategy-pipeline-depth: 5. 
-// WITH_OPTIONS: transform.memref.multibuffer %{{.*}} {factor = 5 : i64, skip_analysis} -// WITH_OPTIONS: transform.apply_patterns.vector.transfer_to_scf full_unroll = true -// The attribute should match td-matmul-use-mma-sync. -// WITH_OPTIONS: transform.iree.create_async_groups %{{.*}} {use_mma_sync} -// The depth should match td-matmul-strategy-pipeline-depth: 5. -// WITH_OPTIONS: transform.iree.pipeline_shared_memory_copies %{{.*}} {depth = 5 : i64, use_mma_sync} -// WITH_OPTIONS: transform.apply_patterns.vector.lower_masks -// WITH_OPTIONS: transform.apply_patterns.vector.materialize_masks -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.linalg.tiling_canonicalization -// WITH_OPTIONS: transform.apply_patterns.memref.fold_memref_alias_ops -// WITH_OPTIONS: } : !transform.any_op -// WITH_OPTIONS: apply_patterns to %{{.*}} { -// WITH_OPTIONS: transform.apply_patterns.canonicalization -// WITH_OPTIONS } -// WITH_OPTIONS: transform.iree.apply_licm -// WITH_OPTIONS: transform.apply_cse to - - -// WITH_OPTIONS_2-LABEL: func @matmul_1 - -// WITH_OPTIONS_3-LABEL: func @matmul_1 - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_2() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2051, 2555], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2051x2555xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2555, 2051], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2555x2050xf32> - %5 = tensor.empty() : tensor<2051x2050xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2051x2555xf32>, tensor<2555x2050xf32>) outs(%6 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2051, 2050], strides = [1, 1] : tensor<2051x2050xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul_2 - -// CHECK: transform.named_sequence -// CHECK: transform.iree.match_callback failures(propagate) "matmul" -// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [128, 128](mapping = [#gpu.block, #gpu.block]) -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice -// CHECK: transform.structured.tile_using_for %{{.*}}[0, 0, 16] -// align1 -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) -// align2 -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 64](mapping = [#gpu.thread, #gpu.thread]) -// align2 -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 64](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// align1 -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [16, 
1] -// align2 -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [8, 2] -// align2 -// CHECK: transform.structured.vectorize %{{.*}} vector_sizes [64, 2] - -// WITH_OPTIONS_2-LABEL: func @matmul_2 - -// WITH_OPTIONS_3-LABEL: func @matmul_2 - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_3() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2556xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2556xf32> - %5 = tensor.empty() : tensor<2048x2556xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2048x2556xf32>, tensor<2556x2556xf32>) outs(%6 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : tensor<2048x2556xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul_3 - -// CHECK: transform.named_sequence - -// WITH_OPTIONS_2-LABEL: func @matmul_3 - -// WITH_OPTIONS_3-LABEL: func @matmul_3 - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_4_partially_unaligned() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2044xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2044x1024xf32> - %5 = tensor.empty() : tensor<2048x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2048x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : tensor<2048x1024xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul_4_partially_unaligned - -// CHECK: transform.structured.tile_using_for %tiled_op tile_sizes [0, 0, 16] - -// Make sure we do not canonicalize because the result is still aligned. 
-// CHECK-NEXT: transform.structured.pad %tiled_linalg_op -// CHECK-SAME: copy_back_op = "none" -// CHECK-SAME: nofold_flags = [1, 1, 1] -// CHECK-SAME: padding_dimensions = [0, 1, 2] -// CHECK-SAME: padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32] -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: %[[RES_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[2] -// CHECK: %[[RES_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RES_PAD]] -// CHECK: %[[LHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[0] -// CHECK: %[[RHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[1] -// CHECK: %[[TILED_LHS:.+]], %{{.*}} = transform.structured.tile_using_forall %[[LHS_PAD]] num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.match ops{["scf.if"]} -// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch -// CHECK: %[[TILED_RHS:.+]], %{{.*}} = transform.structured.tile_using_forall %[[RHS_PAD]] num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.match ops{["scf.if"]} -// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// alignLhs -// CHECK: transform.structured.vectorize %[[TILED_LHS]] vector_sizes [4, 4] -// alignRhs -// CHECK: transform.structured.vectorize %[[TILED_RHS]] vector_sizes [4, 4] - -// CHECK: transform.apply_patterns.vector.lower_masks -// CHECK: transform.apply_patterns.vector.materialize_masks - -// WITH_OPTIONS_2-LABEL: func @matmul_4_partially_unaligned - -// WITH_OPTIONS_3-LABEL: func @matmul_4_partially_unaligned - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @aligned_matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2048xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2048xf32> - %5 = tensor.empty() : tensor<2048x2048xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2048x2048xf32>, tensor<2048x2048xf32>) outs(%6 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @aligned_matmul - -// Block level is the same for aligned. 
-// CHECK: transform.structured.tile_using_for %tiled_op tile_sizes [0, 0, 16] - -// Make sure we do not canonicalize if the result is aligned to avoid folding the extract_slice on the iterator. -// CHECK-NEXT: transform.structured.pad %tiled_linalg_op -// CHECK-SAME: copy_back_op = "none" -// CHECK-SAME: nofold_flags = [1, 1, 1] -// CHECK-SAME: padding_dimensions = [0, 1, 2] -// CHECK-SAME: padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32] - -// Canonicalization is currently required here to enable pad to dps to produce linalg.copy ops. -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: %[[RES_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[2] -// CHECK: %[[RES_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RES_PAD]] -// CHECK: %[[LHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[0] -// CHECK: %[[RHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[1] -// CHECK: %[[LHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[LHS_PAD]] -// CHECK: %[[RHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS_PAD]] -// CHECK: transform.structured.tile_using_forall %[[LHS_COPY]] num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %[[RHS_COPY]] num_threads [4, 32](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// Verify we don't go down the path without the flag. 
-// WITH_OPTIONS-LABEL: func @aligned_matmul - -// WITH_OPTIONS-NOT: transform.sequence -// WITH_OPTIONS-NOT: transform.named_sequence - -// WITH_OPTIONS_2-LABEL: func @aligned_matmul - -// WITH_OPTIONS_3-LABEL: func @aligned_matmul - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul_5_small() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 2044], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x2044xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2044, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2044x1024xf32> - %5 = tensor.empty() : tensor<2x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x1024xf32>) -> tensor<2x1024xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2x1024xf32>) -> tensor<2x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK: iree_codegen.translation_info -// CHECK-LABEL: func @matmul_5_small - -// This matmul is considered "too small"/"degenerate" for a tensor core strategy, -// just fallback to the vectorized strategy. - -// WITH_OPTIONS_2-LABEL: func @matmul_5_small - -// WITH_OPTIONS_3-LABEL: func @matmul_5_small - -// SMALL-LABEL: func @matmul_5_small -// SMALL: transform.named_sequence -// SMALL-NOT: mma -// SMALL-NOT: wmma - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @f16_matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf16> - %5 = tensor.empty() : tensor<2052x2052xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16> - %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf16>, tensor<2556x2052xf16>) outs(%6 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf16> -> !flow.dispatch.tensor> - return -} - -// CHECK: iree_codegen.translation_info -// CHECK-LABEL: func @f16_matmul -// CHECK-NOT: transform.sequence -// CHECK-NOT: transform.named_sequence - -// WITH_OPTIONS_2-LABEL: func @f16_matmul - -// WITH_OPTIONS_3-LABEL: func @f16_matmul - -// 
----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @int8_matmul() { - %c0 = arith.constant 0 : index - %c0_i8 = arith.constant 0 : i8 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x2556xi8> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xi8> - %5 = tensor.empty() : tensor<4x2052xi8> - %6 = linalg.fill ins(%c0_i8 : i8) outs(%5 : tensor<4x2052xi8>) -> tensor<4x2052xi8> - %7 = linalg.matmul ins(%3, %4 : tensor<4x2556xi8>, tensor<2556x2052xi8>) outs(%6 : tensor<4x2052xi8>) -> tensor<4x2052xi8> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4, 2052], strides = [1, 1] : tensor<4x2052xi8> -> !flow.dispatch.tensor> - return -} - -// SMALL-LABEL: func @int8_matmul -// SMALL: transform.named_sequence -// SMALL-NOT: mma -// SMALL-NOT: wmma - -// CHECK-LABEL: func @int8_matmul -// CHECK-NOT: transform.sequence -// CHECK-NOT: transform.named_sequence - -// WITH_OPTIONS-LABEL: func @int8_matmul -// WITH_OPTIONS-NOT: transform.sequence -// WITH_OPTIONS-NOT: transform.named_sequence - -// WITH_OPTIONS_2-LABEL: func @int8_matmul -// WITH_OPTIONS_2-NOT: transform.sequence -// WITH_OPTIONS_2-NOT: transform.named_sequence - -// WITH_OPTIONS_3-LABEL: func @int8_matmul -// WITH_OPTIONS_3-NOT: transform.sequence -// WITH_OPTIONS_3-NOT: transform.named_sequence diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir deleted file mode 100644 index 599ea923d988..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir +++ /dev/null @@ -1,150 +0,0 @@ -// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true \ -// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-pad-strategy \ -// RUN: | FileCheck %s - -// Check that setting the command line options affect the transform -// strategy as expected. 
-// RUN: iree-opt %s --split-input-file --iree-codegen-llvmgpu-enable-transform-dialect-jit=true \ -// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ -// RUN: --iree-gpu-test-target=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-pad-strategy \ -// RUN: --td-pad-strategy-blk-sizes=16,32,1 \ -// RUN: --td-pad-strategy-num-threads=8,4,1 \ -// RUN: --td-pad-strategy-vector-size=2,4 \ -// RUN: --td-pad-strategy-use-async-copies=false \ -// RUN: | FileCheck --check-prefix=WITH_OPTIONS %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -func.func @pad() { - %c0 = arith.constant 0 : index - %c56 = arith.constant 56 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> - %cst_0 = arith.constant 0.000000e+00 : f32 - %padded = tensor.pad %2 low[%c0, 0] high[5, %c56] { - ^bb0(%arg0: index, %arg1: index): - tensor.yield %cst_0 : f32 - } : tensor<123x456xf32> to tensor<128x512xf32> - flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @pad -// CHECK: transform.named_sequence -// CHECK: transform.iree.register_match_callbacks -// CHECK: {{.*}} = transform.iree.match_callback failures(propagate) "pad"({{.*}}) : (!transform.any_op) -> !transform.any_op -// CHECK: transform.structured.tile_using_forall {{.*}} tile_sizes [64, 64](mapping = [#gpu.block, #gpu.block]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: {{.*}} = transform.structured.match ops{["scf.if"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.scf.take_assumed_branch {{.*}} take_else_branch : (!transform.any_op) -> () -// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice {{.*}} : (!transform.any_op) -> () -// CHECK: {{.*}} = transform.structured.tile_using_forall {{.*}} num_threads [16, 16](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: {{.*}} = transform.structured.match ops{["scf.if"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.scf.take_assumed_branch {{.*}} take_else_branch : (!transform.any_op) -> () -// CHECK: transform.structured.vectorize {{.*}} vector_sizes [4, 4] : !transform.any_op -// CHECK: {{.*}} = transform.structured.match ops{["func.func"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.apply_patterns.vector.lower_masked_transfers -// CHECK: apply_patterns to %{{.*}} { -// CHECK-DAG: transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface -// CHECK-DAG: transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices -// CHECK-DAG: 
transform.apply_patterns.vector.cast_away_vector_leading_one_dim -// CHECK: } : !transform.any_op -// CHECK: {{.*}} = transform.structured.vectorize_children_and_apply_patterns {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.canonicalization -// CHECK } -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to -// CHECK: transform.iree.eliminate_empty_tensors {{.*}} : (!transform.any_op) -> () -// CHECK: {{.*}} = transform.iree.bufferize {target_gpu} {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: {{.*}} = transform.structured.match ops{["func.func"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.memref.erase_dead_alloc_and_stores {{.*}} : (!transform.any_op) -> () -// CHECK: {{.*}} = transform.structured.match ops{["func.func"]} in {{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.iree.forall_to_workgroup {{.*}} : (!transform.any_op) -> () -// CHECK: transform.iree.map_nested_forall_to_gpu_threads {{.*}} workgroup_dims = [16, 16, 1] {{.*}}: (!transform.any_op) -> () -// CHECK: transform.apply_patterns.vector.lower_masks -// CHECK: transform.apply_patterns.vector.materialize_masks -// CHECK: apply_patterns to %{{.*}} { -// CHECK-DAG: transform.apply_patterns.linalg.tiling_canonicalization -// CHECK-DAG: transform.apply_patterns.memref.fold_memref_alias_ops -// CHECK-DAG: transform.apply_patterns.canonicalization -// CHECK: } : !transform.any_op -// CHECK: transform.iree.apply_licm -// CHECK: transform.apply_cse to - -// WITH_OPTIONS-LABEL: func @pad -// WITH_OPTIONS: transform.structured.tile_using_forall {{.*}} tile_sizes [32, 16](mapping = [#gpu.block, #gpu.block]) -// WITH_OPTIONS: {{.*}} = transform.structured.tile_using_forall {{.*}} num_threads [4, 8](mapping = [#gpu.thread, #gpu.thread]) -// WITH_OPTIONS: transform.structured.vectorize {{.*}} vector_sizes [2, 4] : !transform.any_op -// WITH_OPTIONS: transform.iree.map_nested_forall_to_gpu_threads {{.*}} workgroup_dims = [8, 4, 1] - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -func.func @pad_low() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> - %cst_0 = arith.constant 0.000000e+00 : f32 - %padded = tensor.pad %2 low[5, 0] high[0, 56] { - ^bb0(%arg0: index, %arg1: index): - tensor.yield %cst_0 : f32 - } : tensor<123x456xf32> to tensor<128x512xf32> - flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> - return -} - -// The strategy doesn't apply for low padding. 
-// CHECK-LABEL: @pad_low -// CHECK-NOT: transform.iree -// WITH_OPTIONS-LABEL: @pad_low -// WITH_OPTIONS-NOT: transform.iree - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding -]> -func.func @pad_local() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> - %padded = tensor.pad %2 low[0, 0] high[5, 56] { - ^bb0(%arg0: index, %arg1: index): - %3 = arith.index_cast %arg0 : index to i64 - %4 = arith.uitofp %3 : i64 to f32 - tensor.yield %4 : f32 - } : tensor<123x456xf32> to tensor<128x512xf32> - flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> - return -} - -// The strategy doesn't apply for local pad values. -// CHECK-LABEL: @pad_local -// CHECK-NOT: transform.iree -// WITH_OPTIONS-LABEL: @pad_local -// WITH_OPTIONS-NOT: transform.iree diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel b/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel index ccd545988581..d264a26551f9 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel @@ -92,7 +92,6 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect", "//compiler/src/iree/compiler/Codegen/Interfaces:PartitionableLoopsInterface", - "//compiler/src/iree/compiler/Codegen/TransformStrategies/GPU", "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "//compiler/src/iree/compiler/Dialect/Flow/IR", diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt index 7f4ccd972613..08ec5885dc97 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt @@ -141,7 +141,6 @@ iree_cc_library( iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect iree::compiler::Codegen::Interfaces::PartitionableLoopsInterface - iree::compiler::Codegen::TransformStrategies::GPU iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils iree::compiler::Dialect::Flow::IR diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp index 568365965cff..16a1acf4316f 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp @@ -10,7 +10,6 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" #include "iree/compiler/Codegen/Utils/Utils.h" @@ -45,11 +44,6 @@ constexpr int kMaxVectorNumBits = 128; namespace 
mlir::iree_compiler { -llvm::cl::opt clSPIRVEnableTransformDialectJit( - "iree-spirv-enable-transform-dialect-jit", - llvm::cl::desc("Enable the usage of the transform dialect JIT"), - llvm::cl::init(false)); - using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline; //===----------------------------------------------------------------------===// @@ -1490,47 +1484,6 @@ static LogicalResult setDefaultOpConfig(IREE::GPU::TargetAttr target, workgroupSize); } -//===----------------------------------------------------------------------===// -// Transform Dialect Specialized Configurations -//===----------------------------------------------------------------------===// - -static LogicalResult -setTransformDialectConfig(mlir::FunctionOpInterface entryPoint, Operation *op, - IREE::GPU::TargetAttr target) { - if (!clSPIRVEnableTransformDialectJit) { - return failure(); - } - - MLIRContext *context = entryPoint.getContext(); - auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( - context, CodeGenPipeline::TransformDialectCodegen); - - // TODO: unify the target information into one structure. - iree_compiler::gpu::GPUModel gpuModel; - gpuModel.hasWarpShuffle = target.supportsSubgroupShuffle(); - gpuModel.hasTF32TensorCore = false; - gpuModel.hasMmaSync = false; - gpuModel.hasTF32TensorCore = false; - gpuModel.minSubgroupSize = target.getMinSubgroupSize(); - gpuModel.maxSubgroupSize = target.getMaxSubgroupSize(); - gpuModel.maxWorkGroupInvocations = - target.getWgp().getMaxThreadCountPerWorkgroup(); - - // Populates the supported WMMA fragment combinations from the target - // environment. Infer tf32 support from the list of supported fragment types. - for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { - auto [mSize, nSize, kSize] = mma.getMNKShape(); - auto [aType, bType, cType] = mma.getABCElementTypes(); - gpuModel.supportedWMMAConfigs.emplace_back(iree_compiler::gpu::MMAConfig{ - mSize, nSize, kSize, aType, bType, cType}); - } - - if (failed(iree_compiler::gpu::matchAndSetTransformStrategy(entryPoint, op, - gpuModel))) - return failure(); - return setTranslationInfo(entryPoint, translationInfo); -} - //===----------------------------------------------------------------------===// // Configuration Dispatcher //===----------------------------------------------------------------------===// @@ -1540,11 +1493,6 @@ setTransformDialectConfig(mlir::FunctionOpInterface entryPoint, Operation *op, static LogicalResult setSPIRVOpConfig(IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPointFn, Operation *rootOp) { - // First try to see if there is a matching transform dialect configuration. - if (succeeded(setTransformDialectConfig(entryPointFn, rootOp, target))) { - return success(); - } - // First try to find a proper CodeGen configuration to tile and vectorize for // the current target architecture. 
if (target.isAMD() && succeeded(detail::setAMDCodeGenConfig(target, rootOp))) diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp index ef68e53ffe4f..219ec01dd8b6 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp @@ -9,16 +9,7 @@ #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" #include "iree/compiler/Codegen/SPIRV/KernelConfig.h" #include "iree/compiler/Codegen/SPIRV/Passes.h" -#include "iree/compiler/Dialect/HAL/IR/HALDialect.h" -#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" @@ -42,15 +33,8 @@ class SPIRVSelectLoweringStrategyPass final SPIRVSelectLoweringStrategyPass>::SPIRVSelectLoweringStrategyPassBase; void getDependentDialects(DialectRegistry ®istry) const override { - // TODO(qedawkins): Once TransformStrategies is deprecated, drop the - // unnecessary dialect registrations. - registry - .insert(); + registry.insert(); } void runOnOperation() override; diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel index 3886c6e20938..e4807d9310df 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel @@ -59,7 +59,6 @@ iree_lit_test_suite( "pipeline_reduction_subgroup.mlir", "pipeline_sub_byte_dequant.mlir", "physical_storage_buffer_addresses.mlir", - "set_transform_strategy.mlir", "tile_and_distribute.mlir", "tile_and_distribute_scatter.mlir", "tile_and_distribute_sort.mlir", diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt index 078f92ab37b3..f28a588339ca 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt @@ -55,7 +55,6 @@ iree_lit_test_suite( "pipeline_matvec.mlir" "pipeline_reduction_subgroup.mlir" "pipeline_sub_byte_dequant.mlir" - "set_transform_strategy.mlir" "tile_and_distribute.mlir" "tile_and_distribute_scatter.mlir" "tile_and_distribute_sort.mlir" diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir deleted file mode 100644 index d32855d538b7..000000000000 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir +++ /dev/null @@ -1,44 +0,0 @@ -// RUN: iree-opt %s --split-input-file --iree-gpu-test-target=volta@vulkan \ -// RUN: --pass-pipeline="builtin.module(iree-spirv-select-lowering-strategy-pass)"\ -// RUN: --iree-spirv-enable-transform-dialect-jit=true - -// TODO: Transform script based CodeGen expects fp32-input to target tensor -// core, but there are no such wmma intrinsics. Fix it to support fp16-input. 
-// TODO: | FileCheck %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.pipeline.binding, - #hal.pipeline.binding -]> -func.func @matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf32> - %5 = tensor.empty() : tensor<2052x2052xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK-LABEL: func @matmul - -// CHECK: transform.named_sequence - -/// The specific vector sizes are tested in the LLVMGPU tests and thus omitted -/// here. This is just to check that masked vectorization is used. -// CHECK-COUNT-3: transform.structured.vectorize - -// Verify use of WMMA. -// CHECK: apply_patterns to %{{.*}} { -// CHECK: transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync -// CHECK: } : !transform.any_op -// CHECK: transform.iree.vector.vector_to_mma_conversion %{{.*}} {use_wmma} - -// Verify asynchronous copy is not used. -// CHECK-NOT: transform.iree.create_async_groups diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/BUILD.bazel deleted file mode 100644 index 236a47446725..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/BUILD.bazel +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright 2020 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/CMakeLists.txt deleted file mode 100644 index d74a77855614..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# compiler/src/iree/compiler/Codegen/TransformStrategies/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
# -################################################################################ - -iree_add_all_subdirs() - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/BUILD.bazel deleted file mode 100644 index bf6645762b0d..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/BUILD.bazel +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2023 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:build_defs.oss.bzl", "iree_compiler_cc_library") - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_compiler_cc_library( - name = "CPU", - srcs = [ - "Common.cpp", - "ReductionStrategy.cpp", - ], - hdrs = [ - "Common.h", - "ReductionStrategy.h", - ], - deps = [ - # Dialects - "//compiler/src/iree/compiler/Dialect/Flow/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/TransformExtensions:LinalgExtExtensions", - "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineUtils", - "@llvm-project//mlir:AsyncDialect", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:ArithUtils", - "@llvm-project//mlir:BufferizationDialect", - "@llvm-project//mlir:BufferizationTransforms", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:FunctionInterfaces", - "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:LinalgDialect", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:PDLDialect", - "@llvm-project//mlir:PDLInterpDialect", - "@llvm-project//mlir:SCFDialect", - "@llvm-project//mlir:SCFUtils", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformDialect", - "@llvm-project//mlir:VectorDialect", - # IR - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Rewrite", - # Interfaces - # Transforms (needed mostly for the BufferizableOpInterfaceImpl) - "@llvm-project//mlir:ArithTransforms", - "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:SCFTransforms", - "@llvm-project//mlir:TensorTransforms", - "@llvm-project//mlir:VectorTransforms", - # Other Stuff - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:DialectUtils", - # TransformStrategies - "//compiler/src/iree/compiler/Codegen/TransformStrategies/Common:TransformStrategies", - # TransformExtensions - "//compiler/src/iree/compiler/Codegen/Common/TransformExtensions:CommonExtensions", - "//compiler/src/iree/compiler/Codegen/LLVMCPU/TransformExtensions:LLVMCPUExtensions", - "@llvm-project//mlir:LinalgTransformOps", - "@llvm-project//mlir:VectorTransformOps", - # TransformMatchers and other stuff - "//llvm-external-projects/iree-dialects:IREEDialectsTransforms", - ], -) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/CMakeLists.txt deleted file mode 100644 index 06ac540e9d91..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ 
-################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. # -################################################################################ - -iree_add_all_subdirs() - -iree_cc_library( - NAME - CPU - HDRS - "Common.h" - "ReductionStrategy.h" - SRCS - "Common.cpp" - "ReductionStrategy.cpp" - DEPS - IREEDialectsTransforms - IREELinalgTransformDialect - LLVMSupport - MLIRAffineDialect - MLIRAffineUtils - MLIRAnalysis - MLIRArithDialect - MLIRArithTransforms - MLIRArithUtils - MLIRAsyncDialect - MLIRBufferizationDialect - MLIRBufferizationTransforms - MLIRFuncDialect - MLIRFunctionInterfaces - MLIRGPUDialect - MLIRIR - MLIRLLVMDialect - MLIRLinalgDialect - MLIRLinalgTransformOps - MLIRLinalgTransforms - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRParser - MLIRPass - MLIRRewrite - MLIRSCFDialect - MLIRSCFTransforms - MLIRSCFUtils - MLIRSupport - MLIRTensorDialect - MLIRTensorTransforms - MLIRTransformDialect - MLIRVectorDialect - MLIRVectorTransformOps - MLIRVectorTransforms - iree::compiler::Codegen::Common::TransformExtensions::CommonExtensions - iree::compiler::Codegen::LLVMCPU::TransformExtensions::LLVMCPUExtensions - iree::compiler::Codegen::TransformStrategies::Common::TransformStrategies - iree::compiler::Dialect::Flow::IR - iree::compiler::Dialect::LinalgExt::IR - iree::compiler::Dialect::LinalgExt::TransformExtensions::LinalgExtExtensions - PUBLIC -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.cpp deleted file mode 100644 index 0b3d1fb33294..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/CPU/Common.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMCPU/TransformExtensions/LLVMCPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Dialect/Flow/IR/FlowOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. -using iree_compiler::cpu::CPUModel; -using iree_compiler::cpu::ReductionConfig; -using iree_compiler::cpu::ReductionStrategy; -using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using transform::ApplyLowerContractionPatternsOp; -using transform::ApplyLowerMultiReductionPatternsOp; -using transform::ApplyLowerShapeCastPatternsOp; -using transform::ApplyLowerTransferPatternsOp; -using transform::ApplyLowerTransposePatternsOp; -using transform::ApplySplitTransferFullPartialPatternsOp; -using transform::ApplyTransferPermutationPatternsOp; -using transform::ApplyTransferToScfPatternsOp; -using transform::MatchOp; -using transform::SplitHandleOp; -using transform_ext::AllDims; -using transform_ext::m_StructuredOp; -using transform_ext::NumEqualsTo; -using transform_ext::RegisterMatchCallbacksOp; -using transform_ext::ShapeKind; -using transform_ext::StructuredOpMatcher; -using vector::VectorContractLoweringAttr; - -//===----------------------------------------------------------------------===// -// Mid-level problem-specific strategy builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// - -// TODO: better builders. 
-static Value buildDefaultVectorLoweringStrategy( - ImplicitLocOpBuilder &b, Value funcH, - const vector::LowerVectorsOptions &lowerVectorsOpts) { - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, lowerVectorsOpts.vectorContractLowering); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, lowerVectorsOpts.vectorMultiReductionLowering); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, lowerVectorsOpts.vectorTransferSplit); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, /*maxTransferRank=*/1, - /*fullUnroll=*/lowerVectorsOpts.unrollVectorTransfers); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create(loc, - /*maxTransferRank=*/1); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create( - loc, /*loweringStrategy=*/lowerVectorsOpts.vectorTransposeLowering, - /*avx2LoweringStrategy=*/lowerVectorsOpts.transposeAVX2Lowering); - }); - return funcH; -} - -/// Take care of the last common steps in a CPU strategy (i.e. vectorize, -/// bufferize and map to blocks). -/// Return the handles to the updated variant and the function ops under -/// the variant op. -std::pair mlir::iree_compiler::cpu::buildCommonTrailingStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const vector::LowerVectorsOptions &lowerVectorsOpts) { - Value funcH = b.create(variantH, func::FuncOp::getOperationName()); - - // Step N-5. Fold tensor.empty to avoid large allocations. - // Step N-4. Perform a pass of canonicalization + enabling after tiling. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - funcH = iree_compiler::buildVectorize(b, funcH); - - // Step N-3. Perform a pass of canonicalization + enabling after vectorization - // as well as hoisting subset operations such as vector.transfer_read/write. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - iree_compiler::buildHoisting(b, funcH); - - // Step N-2. Bufferize and drop HAL descriptor from memref ops. - variantH = iree_compiler::buildBufferize(b, variantH); - - // Step N-1. Post-bufferization mapping to blocks only. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single function to transform, may need hardening. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - b.create(funcH); - - // Step N. Lower vectors. - funcH = buildDefaultVectorLoweringStrategy(b, funcH, lowerVectorsOpts); - return std::make_pair(variantH, funcH); -} - -//===----------------------------------------------------------------------===// -// Higher-level problem-specific strategy creation APIs, these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// - -static ReductionConfig -getReductionConfig(const transform_ext::MatchedReductionCaptures &captures, - const CPUModel &cpuModel) { - return ReductionConfig{16}; -} - -LogicalResult iree_compiler::cpu::matchAndSetReductionStrategy( - mlir::FunctionOpInterface entryPoint, linalg::LinalgOp op, - const CPUModel &cpuModel) { - // 1. Match a reduction and surrounding ops. 
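  // What "match a reduction and surrounding ops" means here (see the matcher
  // construction just below): the matcher looks for an optional leading
  // elementwise op, a fill, a 1-D most-minor reduction, and an optional
  // trailing elementwise op, and with mustMatchEntireFunc set it requires the
  // match to cover the whole dispatch function. If the match fails,
  // matchAndSetReductionStrategy returns failure and no strategy is attached.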
- StructuredOpMatcher *reduction; - transform_ext::MatchedReductionCaptures captures; - transform_ext::MatcherContext matcherContext; - makeReductionMatcher(matcherContext, reduction, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *reduction)) - return failure(); - - // 2. Construct the configuration and the strategy builder. - // TODO: Generalize along the HW axis. - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - ReductionConfig reductionConfig = getReductionConfig(captures, cpuModel); - ReductionStrategy strategy(captures, reductionConfig); - return buildReductionStrategy(b, variant, strategy); - }; - - // 3. Build strategy embedded into the IR. - createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.h deleted file mode 100644 index c663ad87b3f8..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/Common.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_COMMON_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_COMMON_H_ - -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/Interfaces/FunctionInterfaces.h" - -namespace mlir::iree_compiler::cpu { - -//===----------------------------------------------------------------------===// -// Mid-level problem-specific strategy builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -/// Take care of the last common steps in a CPU strategy (i.e. vectorize, -/// bufferize, maps to blocks/workgroups and lower vectors). -/// Return the handles to the updated variant and the function ops under -/// the variant op. -// TODO: pass control to LowerVectorsOp once the builder allows it. -std::pair buildCommonTrailingStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const vector::LowerVectorsOptions &lowerVectorsOpts); - -//===----------------------------------------------------------------------===// -// Higher-level problem-specific strategy creation APIs, these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// -/// Placeholder for some hardware model proxy that contains relevant information -/// to configure the reduction strategy. In the future, this will need to be -/// driven by some contract with the runtime. -struct CPUModel { - static constexpr StringLiteral kDefaultCPU = "DefaultCPU"; - StringRef model = kDefaultCPU; -}; - -/// Map an N-D parallel, 1-D reduction operation with optional leading and -/// optional trailing elementwise operations. -/// The 1-D reduction dimension must be in the most minor dimension. -/// The innermost dimensions of the leading and trailing operations must be most -/// minor along all accesses. -/// Return failure if matching fails. -/// On a successful match, configure a reduction strategy based on a proxy model -/// of the hardware and construct transform dialect IR that implements the -/// reduction strategy. 
The transform dialect IR is added in a top-level -/// ModuleOp after the `entryPoint` function. -LogicalResult matchAndSetReductionStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, - const CPUModel &cpuModel); - -} // namespace mlir::iree_compiler::cpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_COMMON_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.cpp deleted file mode 100644 index f5998663f2fd..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.cpp +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMCPU/TransformExtensions/LLVMCPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/CPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Dialect/Flow/IR/FlowOps.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. -using iree_compiler::cpu::ReductionConfig; -using iree_compiler::cpu::ReductionStrategy; -using transform_ext::RegisterMatchCallbacksOp; - -mlir::iree_compiler::cpu::ReductionStrategy::ReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig) - : AbstractReductionStrategy(captures, {}) { - configure(reductionConfig); - LLVM_DEBUG(DBGS() << "use CPU reduction strategy\n"); -} - -void mlir::iree_compiler::cpu::ReductionStrategy::configure( - const ReductionConfig &config) { - // Block-level - // =========== - // Tile all the parallel dimensions to 8 for now. - int64_t numParallelLoops = captures.reductionRank - 1; - workgroupTileSizes.append(numParallelLoops, 8); - vectorSize = config.vectorSize; -} - -/// Builds the transform IR tiling reductions for CUDA targets. Supports -/// reductions in the last dimension, with optional leading and trailing -/// elementwise operations. -void mlir::iree_compiler::cpu::buildReductionStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const ReductionStrategy &strategy) { - // Step 1. Tiling to the block/workgroup level. Keep everything fused. - auto [maybeLeadingHBlock, gridFillH, gridReductionH, maybeTiledTrailingHBlock, - forall] = - buildReductionStrategyBlockDistribution(b, variantH, - strategy.workgroupTileSizes); - - // Step 2. Naive first strategy to tile the most minor dimension by - // strategy.getVectorSize(). 
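  // A worked instance of this step, assuming the defaults visible above
  // (getReductionConfig returns ReductionConfig{16}, and configure() tiles
  // every parallel loop by 8): for a rank-2 op with one parallel and one
  // reduction dimension, workgroupTileSizes == {8}, and the per-op tile sizes
  // built in the loop below are {0, 16} -- zeros for all but the most minor
  // dimension, which is tiled by the vector size.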
- for (auto [val, rank] : SmallVector>{ - {maybeLeadingHBlock, strategy.captures.maybeLeadingRank}, - {gridReductionH, strategy.captures.reductionRank}, - {maybeTiledTrailingHBlock, strategy.captures.maybeTrailingRank}}) { - if (rank == 0) - continue; - SmallVector tileSizes(rank - 1, 0); - tileSizes.push_back(strategy.getVectorSize()); - buildTileFuseToScfFor(b, variantH, val, {}, - getAsOpFoldResult(b.getI64ArrayAttr(tileSizes))); - } - - // Step 3-5. Common trailing steps. - vector::LowerVectorsOptions lowerVectorsOptions; - lowerVectorsOptions - .setVectorTransformsOptions(vector::VectorContractLowering::OuterProduct) - .setVectorMultiReductionLowering( - vector::VectorMultiReductionLowering::InnerParallel) - .setVectorTransferSplit(vector::VectorTransferSplit::LinalgCopy) - .setVectorTransposeLowering(vector::VectorTransposeLowering::EltWise) - .setTransposeAVX2Lowering(false) - .setUnrollVectorTransfers(true); - buildCommonTrailingStrategy(b, variantH, lowerVectorsOptions); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h deleted file mode 100644 index 282f3ced55b9..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/CPU/ReductionStrategy.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_REDUCTION_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_REDUCTION_STRATEGY_H_ - -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" - -namespace mlir::iree_compiler::cpu { - -struct CPUModel; - -/// Structure to hold a summary of HW-derived properties to configure the -/// reduction strategy. -/// The objective of this struct is to act as a minimal summary of key -/// properties derived from the hardware (e.g. by an oracle) and that are -/// sufficient to steer the strategy to produce a good version. -/// These can be thought of as latent variables or embeddings that directly -/// control the strategy and can be derived from the hardware by some procedure. -struct ReductionConfig { - int64_t vectorSize; -}; - -/// A simple CPU ReductionStrategy. -class ReductionStrategy : public iree_compiler::AbstractReductionStrategy { -public: - ReductionStrategy(const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig); - - ReductionStrategy(const ReductionStrategy &) = default; - ReductionStrategy &operator=(const ReductionStrategy &) = default; - - int64_t getVectorSize() const { return vectorSize; } - -private: - /// Compute the small strategy based on the problem size. - void configure(const ReductionConfig &config); - - /// Vector size. - int64_t vectorSize; -}; - -/// Entry point to build the transform IR corresponding to a reduction strategy. -/// This is used to map an N-D parallel, 1-D reduction operation with optional -/// leading and optional trailing elementwise operations. -/// The 1-D reduction dimension must be in the most minor dimension. -/// The innermost dimensions of the leading and trailing operations must be most -/// minor along all accesses. 
-void buildReductionStrategy(ImplicitLocOpBuilder &b, Value variantH, - const ReductionStrategy &strategy); - -} // namespace mlir::iree_compiler::cpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_CPU_REDUCTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h deleted file mode 100644 index d89ffeed06ea..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_ABSTRACT_REDUCTION_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_ABSTRACT_REDUCTION_STRATEGY_H_ - -#include "iree-dialects/Transforms/TransformMatchers.h" - -namespace mlir::iree_compiler { - -/// Structure to hold the parameters that control the reduction strategy. -struct AbstractReductionStrategy { - AbstractReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - ArrayRef workgroupTileSizes) - : captures(captures), workgroupTileSizes(workgroupTileSizes) {} - - /// Constructor quantities. - transform_ext::MatchedReductionCaptures captures; - - /// Tile sizes for the workgroup / determines grid size for all known - /// reduction strategies. - SmallVector workgroupTileSizes; -}; - -} // namespace mlir::iree_compiler - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_ABSTRACT_REDUCTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/BUILD.bazel deleted file mode 100644 index 6771d9269026..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/BUILD.bazel +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2023 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:build_defs.oss.bzl", "iree_compiler_cc_library") - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_compiler_cc_library( - name = "TransformStrategies", - srcs = [ - "Common.cpp", - ], - hdrs = [ - "AbstractReductionStrategy.h", - "Common.h", - ], - deps = [ - # Dialects - "//compiler/src/iree/compiler/Dialect/Flow/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/TransformExtensions:LinalgExtExtensions", - "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineUtils", - "@llvm-project//mlir:AsyncDialect", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:ArithUtils", - "@llvm-project//mlir:BufferizationDialect", - "@llvm-project//mlir:BufferizationTransforms", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:FunctionInterfaces", - "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:LinalgDialect", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:PDLDialect", - "@llvm-project//mlir:PDLInterpDialect", - "@llvm-project//mlir:SCFDialect", - "@llvm-project//mlir:SCFUtils", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformDialect", - "@llvm-project//mlir:VectorDialect", - # IR - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Rewrite", - # Interfaces - # Transforms (needed mostly for the BufferizableOpInterfaceImpl) - "@llvm-project//mlir:ArithTransforms", - "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:MemRefTransformOps", - "@llvm-project//mlir:SCFTransforms", - "@llvm-project//mlir:SCFTransformOps", - "@llvm-project//mlir:TensorTransforms", - "@llvm-project//mlir:TensorTransformOps", - "@llvm-project//mlir:TransformLoopExtension", - "@llvm-project//mlir:VectorTransforms", - "@llvm-project//mlir:VectorTransformOps", - # Other Stuff - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:DialectUtils", - # TransformExtensions - "//compiler/src/iree/compiler/Codegen/Common/TransformExtensions:CommonExtensions", - # TransformMatchers and other stuff - "//llvm-external-projects/iree-dialects:IREEDialectsTransforms", - ], -) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/CMakeLists.txt deleted file mode 100644 index 0198dab69c14..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# compiler/src/iree/compiler/Codegen/TransformStrategies/Common/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
# -################################################################################ - -iree_add_all_subdirs() - -iree_cc_library( - NAME - TransformStrategies - HDRS - "AbstractReductionStrategy.h" - "Common.h" - SRCS - "Common.cpp" - DEPS - IREEDialectsTransforms - IREELinalgTransformDialect - LLVMSupport - MLIRAffineDialect - MLIRAffineUtils - MLIRAnalysis - MLIRArithDialect - MLIRArithTransforms - MLIRArithUtils - MLIRAsyncDialect - MLIRBufferizationDialect - MLIRBufferizationTransforms - MLIRFuncDialect - MLIRFunctionInterfaces - MLIRGPUDialect - MLIRIR - MLIRLLVMDialect - MLIRLinalgDialect - MLIRLinalgTransforms - MLIRMemRefTransformOps - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRParser - MLIRPass - MLIRRewrite - MLIRSCFDialect - MLIRSCFTransformOps - MLIRSCFTransforms - MLIRSCFUtils - MLIRSupport - MLIRTensorDialect - MLIRTensorTransformOps - MLIRTensorTransforms - MLIRTransformDialect - MLIRTransformLoopExtension - MLIRVectorDialect - MLIRVectorTransformOps - MLIRVectorTransforms - iree::compiler::Codegen::Common::TransformExtensions::CommonExtensions - iree::compiler::Dialect::Flow::IR - iree::compiler::Dialect::LinalgExt::IR - iree::compiler::Dialect::LinalgExt::TransformExtensions::LinalgExtExtensions - PUBLIC -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp deleted file mode 100644 index 75d492f859de..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp +++ /dev/null @@ -1,485 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h" -#include "mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") - -// TODO: significantly better namespacing. 
-using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using iree_compiler::IREE::transform_dialect::IREEBufferizeOp; -using iree_compiler::IREE::transform_dialect::IREEEliminateEmptyTensorsOp; -using iree_compiler::IREE::transform_dialect:: - PopulateWorkgroupCountRegionUsingNumThreadsSliceOp; -using transform::FuseIntoContainingOp; -using transform::HoistLoopInvariantSubsetsOp; -using transform::MatchOp; -using transform::MemRefEraseDeadAllocAndStoresOp; -using transform::MergeHandlesOp; -using transform::NamedSequenceOp; -using transform::PrintOp; -using transform::SplitHandleOp; -using transform::SplitReductionOp; -using transform::TileUsingForallOp; -using transform::VectorizeChildrenAndApplyPatternsOp; -using transform_ext::RegisterMatchCallbacksOp; -using transform_ext::TakeFirstOp; - -/// Matches `args` within `targetH` and unpacks a number of handles `N`. -/// Assumes there are exactly `N` matched ops (but could be relaxed). -/// Returns the tuple of handles. -template -auto matchAndUnpack(ImplicitLocOpBuilder &b, Value targetH, - MatchingArgs... args) { - Value matchedH = b.create(targetH, args...); - auto matchOp = b.create(matchedH, - /*numHandles=*/N); - assert(matchOp->getNumResults() == N && "Unexpected number of results"); - std::array a; - for (int64_t i = 0; i < N; ++i) - a[i] = matchOp->getResult(i); - return std::tuple_cat(a); -} - -int64_t mlir::iree_compiler::previousMultipleOf(int64_t val, int64_t multiple) { - assert(val > 0 && "expected nonnegative val"); - assert(multiple > 0 && "expected nonnegative multiple"); - return (val / multiple) * multiple; -} - -int64_t mlir::iree_compiler::nextMultipleOf(int64_t val, int64_t multiple) { - assert(val > 0 && "expected nonnegative val"); - assert(multiple > 0 && "expected nonnegative multiple"); - return ((val + multiple - 1) / multiple) * multiple; -} - -FailureOr -mlir::iree_compiler::maxDivisorOfValueBelowLimit(int64_t value, int64_t limit) { - // Conservatively return failure when `limit` is greater than 1024 to avoid - // prohibitively long compile time overheads. - // TODO: approximate with a faster implementation based on a few desirable - // primes. - if (limit > 1024) - return failure(); - // If either value or limit is <= 0, the loop is skipped and we fail. 
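  // Worked examples for the rounding and divisor helpers here (illustrative
  // only):
  //   previousMultipleOf(50, 16) == 48   (rounds down to a multiple)
  //   nextMultipleOf(50, 16)     == 64   (rounds up to a multiple)
  //   maxDivisorOfValueBelowLimit(128, 100) returns 64, the largest divisor
  //   of 128 that does not exceed 100; limits above 1024 conservatively fail
  //   to keep compile time bounded, as noted above.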
- for (int64_t i = std::min(value, limit); i > 1; --i) - if (value % i == 0) - return i; - return failure(); -} - -void mlir::iree_compiler::createTransformRegion( - mlir::FunctionOpInterface entryPoint, StrategyBuilderFn buildStrategy) { - MLIRContext *ctx = entryPoint.getContext(); - Location loc = entryPoint.getLoc(); - OpBuilder b(ctx); - b.setInsertionPointAfter(entryPoint); - auto topLevelTransformModule = b.create(loc); - topLevelTransformModule->setAttr( - transform::TransformDialect::kWithNamedSequenceAttrName, b.getUnitAttr()); - Region &topLevelTransformRegion = topLevelTransformModule.getBodyRegion(); - b.setInsertionPointToStart(&topLevelTransformRegion.front()); - auto anyOpType = transform::AnyOpType::get(b.getContext()); - auto sequence = b.create( - loc, - /*symName=*/ - std::string( - transform::TransformDialect::kTransformEntryPointSymbolName.str()), - /*rootType*/ anyOpType, - /*resultTypes=*/TypeRange{}, - /*bodyBuilder=*/[&](OpBuilder &b, Location loc, Value variantH) { - ImplicitLocOpBuilder ib(loc, b); - buildStrategy(ib, variantH); - b.create(loc); - }); - (void)sequence; - - LDBG("transformation script:\n"); - LDBG("verification: " << sequence.verify().succeeded() << "\n"); -} - -//===----------------------------------------------------------------------===// -// Low-level reusable builder APIs, these should follow MLIR-style builders. -//===----------------------------------------------------------------------===// - -/// Prints `handles` in order. Prints the whole IR if `handles` is empty. -void mlir::iree_compiler::buildPrint(ImplicitLocOpBuilder &b, - ValueRange handles) { - if (handles.empty()) - b.create(); - for (auto h : handles) - b.create(h); -} - -/// Create an ApplyPatternsOp that performs a set of key canonicalizations and -/// so-called enabling transformations to normalize the IR. -/// In addition to the specified transform, perform the following ones: -/// tiling-related canonicalization patterns, canonicalization, licm and cse -/// (in this order). -void mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - ImplicitLocOpBuilder &b, Value funcH, - ApplyPatternsOpBodyBuilderFn populatePatternsFn) { - b.create(funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - b.create(loc); - b.create(loc); - if (populatePatternsFn) - populatePatternsFn(b, loc); - }); - b.create( - funcH); - b.create(funcH); -} - -/// Dynamically selects the first non-empty handle; i.e. if (h1, h2) is: -/// - (non-empty, non-empty), returns (h1, h2) -/// - (empty, non-empty), returns (h2, empty) -/// - (non-empty, empty), returns (h1, empty) -/// - (empty, empty), returns (empty, empty) -/// This is used as a normalization operation that replaces conditionals, either -/// in C++ or in transform IR. -/// This can be thought of as a control-flow -> data-dependent conversion. 
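// A plain-C++ analog of the selection semantics documented above -- a sketch
// for illustration only, using std::optional in place of transform-dialect
// handles; it is not the implementation that follows.
#include <optional>
#include <string>
#include <utility>

using Handle = std::optional<std::string>;

// (h1, h2) -> (first non-empty handle, the remaining handle or empty).
static std::pair<Handle, Handle> selectFirstNonEmpty(Handle h1, Handle h2) {
  if (h1)
    return {h1, h2};                     // (non-empty, *): keep both as-is
  if (h2)
    return {h2, std::nullopt};           // (empty, non-empty): promote h2
  return {std::nullopt, std::nullopt};   // (empty, empty)
}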
-std::pair -mlir::iree_compiler::buildSelectFirstNonEmpty(ImplicitLocOpBuilder &b, - Value handle1, Value handle2) { - auto anyOpType = transform::AnyOpType::get(b.getContext()); - auto selector = b.create(anyOpType, anyOpType, - ArrayRef{handle1, handle2}); - return std::make_pair(selector.getFirst(), selector.getRest()); -} - -mlir::iree_compiler::TileToScfForAndFuseResult -mlir::iree_compiler::buildTileFuseToScfFor(ImplicitLocOpBuilder &b, - Value variantH, Value rootH, - ValueRange opsHToFuse, - ArrayRef tileSizes, - bool canonicalize) { - assert(opsHToFuse.empty() && "No fusion supported yet"); - iree_compiler::TileToScfForAndFuseResult result; - auto tiletoScfForOp = b.create(rootH, tileSizes); - result.forLoops = tiletoScfForOp.getLoops(); - result.tiledOpH = tiletoScfForOp.getTiledLinalgOp(); - - // Perform a pass of canonicalization + enabling after tiling. Currently this - // folds away the extract slice on the iterator, breaking padding on aligned - // matmuls. - // TODO: Make padding less brittle so that this toggle is unnecessary. - if (canonicalize) { - Value funcH = b.create( - variantH, func::FuncOp::getOperationName()); - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - } - return result; -} - -/// Performs the following transformations: -/// 1. Tiles `rootH` to scf.forall to with `tileSizesOrNumThreads` -/// according to whether spec is a TileSizesSpec or a NumThreadsSpec. -/// 2. Maps the resulting scf.forall to threads according to -/// `threadDimMapping`. -/// 3. Iterates over `opsHToFuse` in order and fuses into the containing op. -/// Returns a handle to the resulting scf.forall. -/// -/// Fusion operates in batch mode: a single fusion command is issued and a -/// topological sort is automatically computed by the fusion. -/// Since this applies a single fusion, no interleaved canonicalization / cse / -/// enabling transformation occurs and the resulting fusion may not be as good. -/// -/// In the future, an iterative mode in which the user is responsible for -/// providing the fusion order and has interleaved canonicalization / cse / -/// enabling transform will be introduced and may result in better fusions. -/// -/// If `resultingFusedOpsHandles` is a non-null pointer, the fused operation are -/// appended in order. -// TODO: apply forwarding pattern. -template -static iree_compiler::TileToForallAndFuseAndDistributeResult -buildTileAndFuseAndDistributeImpl(ImplicitLocOpBuilder &b, Value variantH, - Value rootH, ValueRange opsHToFuse, - ArrayRef tileSizesOrNumThreads, - ArrayAttr threadDimMapping) { - iree_compiler::TileToForallAndFuseAndDistributeResult result; - auto tileToForeachOp = b.create( - rootH, tileSizesOrNumThreads, TileOrNumThreadSpec(), threadDimMapping); - - result.forallH = tileToForeachOp.getForallOp(); - result.tiledOpH = tileToForeachOp.getTiledOp(); - - // Perform a pass of canonicalization + enabling after tiling. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Batch fusion if requested. - if (opsHToFuse.size() > 1) { - Value mergedOpsH = - b.create(opsHToFuse, /*deduplicate=*/true); - b.create(mergedOpsH, result.forallH).getFusedOp(); - } else if (opsHToFuse.size() == 1) { - Value fusedH = - b.create(opsHToFuse.front(), result.forallH) - .getFusedOp(); - result.resultingFusedOpsHandles.push_back(fusedH); - } - return result; -} - -// TODO: if someone knows how to properly export templates go for it .. -// sigh. 
-iree_compiler::TileToForallAndFuseAndDistributeResult -mlir::iree_compiler::buildTileFuseDistToForallWithTileSizes( - ImplicitLocOpBuilder &b, Value variantH, Value rootH, ValueRange opsHToFuse, - ArrayRef tileSizes, ArrayAttr threadDimMapping) { - return buildTileAndFuseAndDistributeImpl( - b, variantH, rootH, opsHToFuse, tileSizes, threadDimMapping); -} - -/// Call buildTileAndFuseAndDistributeImpl with ArrayRef numThreads. -// TODO: if someone knows how to properly export templates go for it .. -// sigh. -iree_compiler::TileToForallAndFuseAndDistributeResult -mlir::iree_compiler::buildTileFuseDistToForallWithNumThreads( - ImplicitLocOpBuilder &b, Value variantH, Value rootH, ValueRange opsHToFuse, - ArrayRef numThreads, ArrayAttr threadDimMapping) { - return buildTileAndFuseAndDistributeImpl( - b, variantH, rootH, opsHToFuse, numThreads, threadDimMapping); -} - -/// Build the transform IR to pad an op `opH`. -// TODO: Better upstream builder. -Value mlir::iree_compiler::buildPad( - ImplicitLocOpBuilder &b, Value opH, ArrayRef paddingValues, - ArrayRef paddingDimensions, ArrayRef packingDimensions, - ArrayRef> transposePaddings) { - SmallVector staticPadToMultipleOf(paddingDimensions.size(), 1); - SmallVector transposeAttrs; - for (auto &transp : transposePaddings) - transposeAttrs.push_back(b.getI64ArrayAttr(transp)); - - Type resultTypes[] = {opH.getType(), - transform::AnyOpType::get(b.getContext()), - transform::AnyOpType::get(b.getContext())}; - return b - .create( - resultTypes, opH, b.getArrayAttr(paddingValues), - b.getI64ArrayAttr(paddingDimensions), - /*padToMultipleOf=*/ValueRange{}, staticPadToMultipleOf, - b.getI64ArrayAttr(packingDimensions), b.getArrayAttr(transposeAttrs), - /*copyBack=*/b.getStringAttr("none")) - ->getResult(0); -} - -/// Apply patterns and vectorize. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -// TODO: configure patterns. -Value mlir::iree_compiler::buildVectorize(ImplicitLocOpBuilder &b, Value funcH, - bool applyCleanups, - bool vectorizePadding, - bool vectorizeNdExtract) { - funcH = b.create(funcH, vectorizePadding, - vectorizeNdExtract); - if (applyCleanups) { - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - } - return funcH; -} - -void mlir::iree_compiler::buildLowerMaskedTransfersAndCleanup( - ImplicitLocOpBuilder &b, Value funcH, bool cleanup) { - // TODO: avoid functional style transform so we can apply to the variant. - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - if (cleanup) { - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - b.create(loc); - }); - } -} - -Value mlir::iree_compiler::buildLowerVectorMasksAndCleanup( - ImplicitLocOpBuilder &b, Value funcH, bool cleanup) { - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - if (cleanup) { - iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - } - return funcH; -} - -/// Hoist redundant subet ops. -void mlir::iree_compiler::buildHoisting(ImplicitLocOpBuilder &b, Value funcH) { - Value loops = - b.create(funcH, scf::ForOp::getOperationName()); - b.create(loops); -} - -/// Bufferize and drop HAL descriptor from memref ops. 
-Value mlir::iree_compiler::buildBufferize(ImplicitLocOpBuilder &b, - Value variantH, bool targetGpu) { - // Perform a pass of canonicalization + enabling before bufferization to avoid - // spurious allocations. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - }); - b.create(funcH); - variantH = b.create(funcH, targetGpu); - return variantH; -} - -namespace { -/// Various handles produced by reduction splitting. -struct ReductionSplitResult { - /// Handle to the leading elementwise operation, may be null if no such - /// operation is present. - Value leadingEltwiseH; - /// Handle to the fill operation feeding the init of a higher-rank - /// more-parallel reduction. - Value splitFillH; - /// Handle to the higher-rank more-parallel reduction. - Value splitLinalgH; - /// Handle to the final reduction. - Value combinerH; - /// Handle to the original fill operation, may be null if the operation - /// was not re-matched. - Value originalFillH; - /// Handle to the trailing fill operation, may be null if the operation - /// was not re-matched. - Value trailingEltwiseH; -}; -} // namespace - -/// Build transform IR to split the reduction into a parallel and combiner part. -/// Then tile the parallel part and map it to `tileSize` threads, each reducing -/// on `vectorSize` elements. -/// Lastly, fuse the newly created fill and elementwise operations into the -/// resulting containing forall op. -/// Return a triple of handles to (forall, fill, combiner) -std::tuple -mlir::iree_compiler::buildTileReductionUsingScfForeach( - ImplicitLocOpBuilder &b, Value isolatedParentOpH, Value reductionH, - int64_t reductionRank, int64_t tileSize, int64_t reductionVectorSize, - Attribute mappingAttr) { - SmallVector leadingParallelDims(reductionRank - 1, 0); - SmallVector numThreads = leadingParallelDims; - numThreads.push_back(tileSize); - SmallVector tileSizes = leadingParallelDims; - tileSizes.push_back(reductionVectorSize); - auto tileReduction = b.create( - /*target=*/reductionH, - /*numThreads=*/numThreads, - /*tileSizes=*/tileSizes, - /*threadDimMapping=*/b.getArrayAttr(mappingAttr)); - Value blockParallelForallOp = tileReduction.getForallOp(); - Value blockParallelFillH = tileReduction.getFillOp().front(); - Value blockCombinerOpH = tileReduction.getCombiningLinalgOp(); - // Fuse the fill and elementwise to privatize them. - blockParallelFillH = - b.create(blockParallelFillH, blockParallelForallOp) - .getFusedOp(); - return std::make_tuple(blockParallelForallOp, blockParallelFillH, - blockCombinerOpH); -} - -std::tuple -mlir::iree_compiler::buildReductionStrategyBlockDistribution( - ImplicitLocOpBuilder &b, Value variantH, - ArrayRef workgroupTileSizes) { - // Step 1. Call the matcher. Note that this is the same matcher as used to - // trigger this compilation path, so it must always apply. - b.create(); - auto [maybeLeadingH, fillH, reductionH, maybeTrailingH] = - unpackRegisteredMatchCallback<4>( - b, "reduction", transform::FailurePropagationMode::Propagate, - variantH); - // Step 2. Create the block/mapping tiling level and fusee. 
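In buildTileReductionUsingScfForeach above, only the most-minor (reduction) dimension receives a non-zero thread count and tile size; the leading parallel dimensions are left untiled (size 0). A standalone illustration of how the two size vectors are formed (the helper name is hypothetical):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Mirrors the vector construction above: leading parallel dims get 0,
// the trailing reduction dim gets `tileSize` threads and
// `reductionVectorSize` elements reduced per thread.
static std::pair<std::vector<int64_t>, std::vector<int64_t>>
makeReductionTileVectors(int64_t reductionRank, int64_t tileSize,
                         int64_t reductionVectorSize) {
  std::vector<int64_t> numThreads(reductionRank - 1, 0);
  std::vector<int64_t> tileSizes(reductionRank - 1, 0);
  numThreads.push_back(tileSize);
  tileSizes.push_back(reductionVectorSize);
  return {numThreads, tileSizes};
}

int main() {
  auto [numThreads, tileSizes] = makeReductionTileVectors(2, 64, 4);
  assert((numThreads == std::vector<int64_t>{0, 64}));
  assert((tileSizes == std::vector<int64_t>{0, 4}));
}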
- auto [fusionTargetH, fusionGroupH] = - buildSelectFirstNonEmpty(b, maybeTrailingH, reductionH); - MLIRContext *ctx = b.getContext(); - SmallVector blockDimMapping{blockX(ctx), blockY(ctx), blockZ(ctx)}; - blockDimMapping.resize(workgroupTileSizes.size()); - TileToForallAndFuseAndDistributeResult tileResult = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/fusionTargetH, - /*opsToFuseH=*/fusionGroupH, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(workgroupTileSizes)), - /*threadDimMapping=*/b.getArrayAttr(blockDimMapping)); - - // Handle the workgroup count region. - b.create( - tileResult.forallH); - - fillH = - b.create(fillH, tileResult.forallH).getFusedOp(); - maybeLeadingH = - b.create(maybeLeadingH, tileResult.forallH) - .getFusedOp(); - - // Perform a pass of canonicalization + enabling after fusion. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Step 3. Normalize to reorder results irrespective of emptiness. - auto [blockReductionH, maybeBlockTrailingH] = buildSelectFirstNonEmpty( - b, tileResult.resultingFusedOpsHandles.front(), tileResult.tiledOpH); - return std::make_tuple(maybeLeadingH, fillH, blockReductionH, - maybeBlockTrailingH, tileResult.forallH); -} - -Value mlir::iree_compiler::buildMemoryOptimizations(ImplicitLocOpBuilder &b, - Value funcH) { - // Apply canonicalizations and enablings twice as they enable each other. - for (int i = 0; i < 2; ++i) { - buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - }); - } - b.create(funcH); - return funcH; -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h deleted file mode 100644 index 482ee2257680..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_COMMON_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_COMMON_H_ - -#include "mlir/Interfaces/FunctionInterfaces.h" -// Needed until IREE builds its own gpu::GPUBlockMappingAttr / gpu::Blocks -// attributes that are reusable across all targets. -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/IR/BuiltinOps.h" - -namespace mlir::iree_compiler { - -//===----------------------------------------------------------------------===// -// Base quantities generally useful for all CPU and GPU strategies. -//===----------------------------------------------------------------------===// -inline Attribute blockX(MLIRContext *ctx) { - return mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimX); -} -inline Attribute blockY(MLIRContext *ctx) { - return mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimY); -} -inline Attribute blockZ(MLIRContext *ctx) { - return mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimZ); -} - -struct AbstractReductionStrategy; - -//===----------------------------------------------------------------------===// -// General helpers. 
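The block-distribution step above truncates the {x, y, z} block mapping to the rank of the workgroup tile sizes. A minimal sketch of that construction, assuming rank <= 3 and an MLIRContext with the GPU dialect loaded (the function name is hypothetical):

#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"

// Build the thread-dim mapping attribute for an N-D workgroup tiling,
// keeping only as many mapping dimensions as there are tile sizes.
static mlir::ArrayAttr
makeBlockMappingSketch(mlir::OpBuilder &b,
                       llvm::ArrayRef<int64_t> workgroupTileSizes) {
  mlir::MLIRContext *ctx = b.getContext();
  llvm::SmallVector<mlir::Attribute> mapping{
      mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimX),
      mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimY),
      mlir::gpu::GPUBlockMappingAttr::get(ctx, mlir::gpu::MappingId::DimZ)};
  mapping.resize(workgroupTileSizes.size()); // e.g. rank 2 -> {DimX, DimY}
  return b.getArrayAttr(mapping);
}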
-//===----------------------------------------------------------------------===// - -/// Return the greatest value smaller or equal to `val` that is a multiple -/// of `multiple`. Asserts that all quantities are nonnegative. I.e. returns -/// `(val / multiple) * multiple` a.k.a `floordiv(val, multiple) * multiple`. -int64_t previousMultipleOf(int64_t val, int64_t multiple); - -/// Return the smallest value greater or equal to `val` that is a multiple of -/// `multiple`. Asserts that all quantities are nonnegative. -/// I.e. returns `((val + multiple - 1) / multiple) * multiple` a.k.a -/// a.k.a `ceildiv(val, multiple) * multiple`. -int64_t nextMultipleOf(int64_t val, int64_t multiple); - -/// Find the highest divisor of `value` that is smaller than `limit`. This is -/// useful to capture any tiling that is guaranteed to keep the IR static. -/// Conservatively return failure when `limit` is greater than 1024 to avoid -/// prohibitively long compile time overheads. -// TODO: approximate with a faster implementation based on a few desirable -// primes. -FailureOr maxDivisorOfValueBelowLimit(int64_t value, int64_t limit); - -using StrategyBuilderFn = std::function; - -/// Use `buildStrategy` to build a ModuleOp containing transform dialect IR, -/// right after function `entryPoint`. -/// This embed the transform into the IR and allows applying it either in debug -/// mode or within the IREE pipeline. -void createTransformRegion(mlir::FunctionOpInterface entryPoint, - StrategyBuilderFn buildStrategy); - -//===----------------------------------------------------------------------===// -// Low-level reusable builder APIs, these should follow MLIR-style builders. -//===----------------------------------------------------------------------===// - -/// Build transform IR that prints `handles` in order, or print the whole IR if -/// `handles` is empty. -void buildPrint(ImplicitLocOpBuilder &b, ValueRange handles = {}); - -using ApplyPatternsOpBodyBuilderFn = std::function; - -/// Create an ApplyPatternsOp that performs a set of key canonicalizations and -/// so-called enabling transformations to normalize the IR. -/// In addition to the specified transform, perform the following ones: -/// canonicalization, tiling_canonicalization, licm and cse (in this order). -void buildCanonicalizationAndEnablingTransforms( - ImplicitLocOpBuilder &b, Value funcH, - ApplyPatternsOpBodyBuilderFn populatePatternsFn = nullptr); - -/// Build transform IR to dynamically selects the first non-empty handle; i.e. -/// if (h1, h2) is: -/// - (non-empty, non-empty), returns (h1, h2) -/// - (empty, non-empty), returns (h2, empty) -/// - (non-empty, empty), returns (h1, empty) -/// - (empty, empty), returns (empty, empty) -/// This is used as a normalization operation that replaces conditionals, either -/// in C++ or in transform IR. -/// This can be thought of as a control-flow -> data-dependent conversion. -std::pair buildSelectFirstNonEmpty(ImplicitLocOpBuilder &b, - Value handle1, Value handle2); - -/// Result of the combined transform performing tiling, fusion and -/// distribution to parallel constructs. -struct TileToScfForAndFuseResult { - /// Vector of `scf.for` loops containing the tiled and fused operations. - SmallVector forLoops; - /// Handles to fused operations other than the final consumer operation. May - /// be empty if fusion was not performed iteratively. - /// This is currently empty - // TODO: support returning handles from `fuse_into_containing_op` and remove - // the restriction above. 
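The doc comments above fully specify previousMultipleOf and nextMultipleOf. A minimal reference implementation with worked values (the `*Ref` names are hypothetical and assume `multiple > 0`):

#include <cassert>
#include <cstdint>

// previousMultipleOf == floordiv(val, multiple) * multiple
// nextMultipleOf     == ceildiv(val, multiple) * multiple
static int64_t previousMultipleOfRef(int64_t val, int64_t multiple) {
  assert(val >= 0 && multiple > 0 && "expected nonnegative quantities");
  return (val / multiple) * multiple;
}
static int64_t nextMultipleOfRef(int64_t val, int64_t multiple) {
  assert(val >= 0 && multiple > 0 && "expected nonnegative quantities");
  return ((val + multiple - 1) / multiple) * multiple;
}

int main() {
  assert(previousMultipleOfRef(100, 32) == 96);
  assert(nextMultipleOfRef(100, 32) == 128);
  assert(previousMultipleOfRef(100, 128) == 0); // no positive multiple fits
  assert(nextMultipleOfRef(128, 32) == 128);    // already a multiple
}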
- SmallVector resultingFusedOpsHandles; - /// Handle to the tiled final consumer operation. - Value tiledOpH; -}; - -/// Build transform IR to perform multi-level tile and fuse into an scf.for op. -/// Note: fusion is currently unsupported. -TileToScfForAndFuseResult -buildTileFuseToScfFor(ImplicitLocOpBuilder &b, Value variantH, Value rootH, - ValueRange opsHToFuse, ArrayRef tileSizes, - bool canonicalize = true); - -/// Result of the combined transform performing tiling, fusion and -/// distribution to parallel constructs. -struct TileToForallAndFuseAndDistributeResult { - /// Outer `scf.forall` loop containing the tiled and fused - /// operations. - Value forallH; - /// Handles to fused operations other than the final consumer operation. May - /// be empty if fusion was not performed iteratively. - // TODO: support returning handles from `fuse_into_containing_op` and remove - // the restriction above. - SmallVector resultingFusedOpsHandles; - /// Handle to the tiled final consumer operation. - Value tiledOpH; -}; - -/// Build transform IR to perform the following transformations: -/// 1. Tiles `rootH` to scf.forall to with `tileSizesOrNumThreads` -/// according to whether spec is a TileSizesSpec or a NumThreadsSpec. -/// 2. Maps the resulting scf.forall to threads according to -/// `threadDimMapping`. -/// 3. Iterates over `opsHToFuse` in order and fuses into the containing op. -/// -/// Fusion operates in batch mode: a single fusion command is issued and a -/// topological sort is automatically computed by the fusion. -/// Since this applies a single fusion, no interleaved canonicalization / cse -/// / enabling transformation occurs and the resulting fusion may not be as -/// good. -/// -/// In the future, an iterative mode in which the user is responsible for -/// providing the fusion order and has interleaved canonicalization / cse / -/// enabling transform will be introduced and may result in better fusions. -/// -/// Note: this version cannot be used for the block-level tiling in a dispatch -/// region. `buildTileFuseDistToForallAndWorkgroupCountWithTileSizes` is -/// the modified version that is aware of the `workgroup_count` region. -/// -// TODO: if someone knows how to properly export templates go for it .. sigh. -TileToForallAndFuseAndDistributeResult buildTileFuseDistToForallWithTileSizes( - ImplicitLocOpBuilder &b, Value variantH, Value rootH, ValueRange opsHToFuse, - ArrayRef tileSizes, ArrayAttr threadDimMapping); - -/// Similar to `buildTileFuseDistWithTileSizes` but using `numThreads` instead -/// of `tileSizes`. -TileToForallAndFuseAndDistributeResult buildTileFuseDistToForallWithNumThreads( - ImplicitLocOpBuilder &b, Value variantH, Value rootH, ValueRange opsHToFuse, - ArrayRef numThreads, ArrayAttr threadDimMapping); - -/// Build transform IR to split the reduction into a parallel and combiner part. -/// Then tile the parallel part and map it to `tileSize` threads, each reducing -/// on `vectorSize` elements. -/// Lastly, fuse the newly created fill and elementwise operations into the -/// resulting containing forall op. -/// Return a triple of handles to (forall, fill, combiner) -std::tuple buildTileReductionUsingScfForeach( - ImplicitLocOpBuilder &b, Value isolatedParentOpH, Value reductionH, - int64_t reductionRank, int64_t tileSize, int64_t reductionVectorSize, - Attribute mappingAttr); - -/// Build the transform IR to pad an op `opH`. -// TODO: Better upstream builder. 
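A minimal usage sketch of buildTileFuseDistToForallWithTileSizes, mirroring the call sites elsewhere in this diff (illustration only; assumes the declarations above, `using namespace mlir; using namespace mlir::iree_compiler;`, and arbitrary 64x64 tile sizes):

// Tile a root op to an scf.forall over workgroups and map it to GPU blocks.
static Value tileRootToWorkgroupsSketch(ImplicitLocOpBuilder &b,
                                        Value variantH, Value rootH) {
  MLIRContext *ctx = b.getContext();
  SmallVector<Attribute> mapping{blockX(ctx), blockY(ctx)};
  TileToForallAndFuseAndDistributeResult res =
      buildTileFuseDistToForallWithTileSizes(
          /*b=*/b, /*variantH=*/variantH, /*rootH=*/rootH,
          /*opsHToFuse=*/{},
          /*tileSizes=*/getAsOpFoldResult(b.getI64ArrayAttr({64, 64})),
          /*threadDimMapping=*/b.getArrayAttr(mapping));
  return res.tiledOpH; // the distributed loop itself is in res.forallH
}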
-Value buildPad(ImplicitLocOpBuilder &b, Value opH, - ArrayRef paddingValues, - ArrayRef paddingDimensions, - ArrayRef packingDimensions, - ArrayRef> transposePaddings = {}); - -/// Build transform IR that applies rank-reduction patterns and vectorizes. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -/// If `applyCleanups` is true, also apply cleanup patterns. -Value buildVectorize(ImplicitLocOpBuilder &b, Value funcH, - bool applyCleanups = false, bool vectorizePadding = false, - bool vectorizeNdExtract = false); - -/// Build transform IR that applies lowering of masked vector transfer -/// operations and subsequent cleanup patterns (fold-memref-aliases). -/// Takes a handle to a containing op and returns an updated handle to the -/// containing op. -void buildLowerMaskedTransfersAndCleanup(ImplicitLocOpBuilder &b, Value funcH, - bool cleanup = true); - -/// Build transform IR that applies vector mask lowering and subsequent cleanup -/// patterns (fold-memref-aliases). -/// Takes a handle to a containing op and returns an updated handle to the -/// containing op. -Value buildLowerVectorMasksAndCleanup(ImplicitLocOpBuilder &b, Value funcH, - bool cleanup = true); - -/// Build transform IR to hoist redundant subset operations. -void buildHoisting(ImplicitLocOpBuilder &b, Value funcH); - -/// Build transform IR to bufferize and drop HAL descriptor from memref ops. -/// Takes a handle variantOp and returns a handle to the same variant op. -Value buildBufferize(ImplicitLocOpBuilder &b, Value variantH, - bool targetGpu = false); - -//===----------------------------------------------------------------------===// -// Higher-level problem-specific strategy creation APIs, these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// - -/// Build transform IR to match exactly an N-D reduction operation (with -/// optional leading and trailing elementwise) and create a top-level -/// `scf.forall` tiled by `strategy.workgroupTileSizes`. -/// The matched `maybeLeadingH`, `fillH`, `reductionH` and `maybeTrailingH` are -/// fused into the top-level `scf.forall` and handles are returned to -/// the fused versions of these ops, in order, that are all tiled and -/// distributed accordingly. The scf.forall is returned as the last -/// value. -/// The mapping of the `scf.forall` dimensions is tied the first -/// dimensions of `strategy.allBlockAttrs`. -/// -/// Note: `buildTileFuseDistToForallAndWorkgroupCountWithTileSizes` is -/// called internally, this version is only for the block-level tiling inside a -/// dispatch region with an attached workgroup_count region. -/// -/// Note: the matching is enforced to be exact (i.e. no other compute ops may -/// exist under variantH). This is consistent with application confined within -/// the dispatch region, where we must not miss any op. -/// -/// Note: A future version of this op will be able to directly apply on the DAG -/// and form the dispatch region. -std::tuple -buildReductionStrategyBlockDistribution(ImplicitLocOpBuilder &b, Value variantH, - ArrayRef workgroupTileSizes); - -/// Build transform IR that applies memory optimizations. 
-Value buildMemoryOptimizations(ImplicitLocOpBuilder &b, Value funcH); - -} // namespace mlir::iree_compiler - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_COMMON_COMMON_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp deleted file mode 100644 index fcba3716fd15..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include - -#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" - -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -/// Options to set the default values of the matmul strategy. - -static llvm::cl::list clBlockTileSizes( - "td-matmul-strategy-blk-sizes", - llvm::cl::desc("block tile size for dims (x,y,z) for the transform " - "dialect matmul strategy"), - llvm::cl::CommaSeparated); -static llvm::cl::opt clReductionTileSize( - "td-matmul-strategy-reduc-size", - llvm::cl::desc( - "reduction tile sized for the transform dialect matmul strategy")); -static llvm::cl::list clNumThreads( - "td-matmul-strategy-num-threads", - llvm::cl::desc("number of threads for dims (x,y,z) for the transform " - "dialect matmul strategy"), - llvm::cl::CommaSeparated); -static llvm::cl::list clNumWarps( - "td-matmul-strategy-num-warps", - llvm::cl::desc("number of warps for dims (x,y,z) for the transform " - "dialect matmul strategy"), - llvm::cl::CommaSeparated); -static llvm::cl::opt clUseAsyncCopies( - "td-matmul-strategy-use-async-copies", - llvm::cl::desc( - "use asynchronous copies for the transform dialect matmul strategy")); -static llvm::cl::opt clUseMmaSync( - "td-matmul-strategy-use-mma-sync", - llvm::cl::desc("use mma sync for the transform dialect matmul strategy")); -static llvm::cl::opt clUseWmma( - "td-matmul-strategy-use-wmma", - llvm::cl::desc("use wmma for the transform dialect matmul strategy")); -static llvm::cl::opt clUseFma( - "td-matmul-strategy-use-fma", - llvm::cl::desc("use fma for the transform dialect matmul strategy")); -static llvm::cl::opt clPipelineDepth( - "td-matmul-strategy-pipeline-depth", - llvm::cl::desc("pipeline depth for the transform dialect matmul strategy")); -static llvm::cl::opt clPeelPipelineEpilogue( - "td-matmul-strategy-peel-pipeline-epilogue", - llvm::cl::desc("whether to peel the pipeline epilogue for the transform " - "dialect matmul strategy")); - -using iree_compiler::gpu::AbstractGemmLikeStrategy; - -/// Key function for vtable. 
-AbstractGemmLikeStrategy::~AbstractGemmLikeStrategy() {} - -void AbstractGemmLikeStrategy::initDefaultValues(const GPUModel &gpuModel) { - blockTileSizes = - SmallVector{clBlockTileSizes.begin(), clBlockTileSizes.end()}; - numThreads = SmallVector{clNumThreads.begin(), clNumThreads.end()}; - numWarps = SmallVector{clNumWarps.begin(), clNumWarps.end()}; - reductionTileSize = clReductionTileSize; - useAsyncCopies = clUseAsyncCopies; - useMmaSync = clUseMmaSync; - useWmma = clUseWmma; - useFma = clUseFma; - pipelineDepth = clPipelineDepth; - peelPipelineEpilogue = clPeelPipelineEpilogue; - - /// cliOptionsSpecified is used to override hard-coded well known good - /// defaults when set. - if (clBlockTileSizes.getNumOccurrences() || - clNumThreads.getNumOccurrences() || clNumWarps.getNumOccurrences() || - clReductionTileSize.getNumOccurrences() || - clUseAsyncCopies.getNumOccurrences() || - clUseMmaSync.getNumOccurrences() || clUseWmma.getNumOccurrences() || - clUseFma.getNumOccurrences() || clPipelineDepth.getNumOccurrences() || - clPeelPipelineEpilogue.getNumOccurrences()) { - cliOptionsSpecified = true; - } - - /// If not specified, select instructions to target for compute. - if (!useMmaSync && !useWmma && !useFma) { - /// First, try to use tensor core. - if (getLhsElementalType() == getRhsElementalType()) { - /// Currently all supported targets at least have WMMA. - /// TODO: Handle targets without tensor core. - if (gpuModel.hasMmaSync) - useMmaSync = true; - else - useWmma = true; - } else { - /// Mixed precision only supported by fma. - useFma = true; - } - } - - /// Prefer smaller subgroup sizes for tensor core strategies. - if (!useFma) - targetSubgroupSize = gpuModel.minSubgroupSize; - - /// Default configuration based on hardware properties and problem bit widths. - if (clBlockTileSizes.getNumOccurrences()) { - blockTileSizes = - SmallVector(clBlockTileSizes.begin(), clBlockTileSizes.end()); - } else { - blockTileSizes = SmallVector{128, 128, 1}; - } - - if (clNumThreads.getNumOccurrences()) { - numThreads = SmallVector(clNumThreads.begin(), clNumThreads.end()); - } else { - // Infer from warp counts if present. 
- if (clNumWarps.getNumOccurrences()) { - numThreads = SmallVector(clNumWarps.begin(), clNumWarps.end()); - numThreads[0] *= getSubgroupSize(); - } else { - numThreads = SmallVector{64, 2, 1}; - } - } - if (clNumWarps.getNumOccurrences()) { - numWarps = SmallVector(clNumWarps.begin(), clNumWarps.end()); - } else { - numWarps = numThreads; - numWarps[0] = llvm::divideCeil(numWarps[0], getSubgroupSize()); - } - if (clUseAsyncCopies.getNumOccurrences()) - useAsyncCopies = clUseAsyncCopies; - else - useAsyncCopies = gpuModel.hasMmaSync; - if (clUseMmaSync.getNumOccurrences()) - useMmaSync = clUseMmaSync; - if (clUseWmma.getNumOccurrences()) - useWmma = clUseWmma; - if (clUseFma.getNumOccurrences()) - useFma = clUseFma; - if (clReductionTileSize.getNumOccurrences()) { - reductionTileSize = clReductionTileSize; - } else { - reductionTileSize = 16; - if (!useFma) { - int64_t maxInputWidth = - std::max(lhsElementalBitWidth(), rhsElementalBitWidth()); - assert(maxInputWidth <= 32 && "requires <= 32-bit types"); - reductionTileSize *= (32 / maxInputWidth); - } - } - if (clPipelineDepth.getNumOccurrences()) { - pipelineDepth = clPipelineDepth; - } else { - pipelineDepth = 0; - if (useAsyncCopies) - pipelineDepth = 3; - } -} - -ArrayAttr -AbstractGemmLikeStrategy::getZeroPadAttrFromElementalTypes(OpBuilder &b) const { - SmallVector paddingValues; - for (Type t : paddingValueTypes) - paddingValues.push_back(b.getZeroAttr(t)); - return b.getArrayAttr(paddingValues); -} - -//===--------------------------------------------------------------------===// -// Validation of support for the configured strategy. -//===--------------------------------------------------------------------===// - -LogicalResult -AbstractGemmLikeStrategy::validate(const GPUModel &gpuModel) const { - if (totalNumThreads() != totalNumWarps() * getSubgroupSize()) { - llvm::errs() << "Number of threads specified by warps must match total " - "number of threads\n"; - return failure(); - } - if (m() < blockTileM()) { - llvm::errs() << "m(" << m() << ") < blockTileM(" << blockTileM() << ") "; - llvm::errs() << "this is at risk of not vectorizing and is NYI"; - return failure(); - } - if (n() < blockTileN()) { - llvm::errs() << "n(" << n() << ") < blockTileN(" << blockTileN() << ") "; - llvm::errs() << "this is at risk of not vectorizing and is NYI"; - return failure(); - } - if (k() < reductionTileSize) { - llvm::errs() << "k(" << k() << ") < reductionTileSize(" << reductionTileSize - << ") "; - llvm::errs() << "this is at risk of not vectorizing and is NYI"; - return failure(); - } - - if (failed(validateLhsCopyMapping())) { - llvm::errs() << "invalid lhs copy mapping"; - return failure(); - } - if (failed(validateRhsCopyMapping())) { - llvm::errs() << "invalid rhs copy mapping"; - return failure(); - } - if (failed(validateResCopyMapping())) { - llvm::errs() << "invalid res copy mapping"; - return failure(); - } - - if (pipelineDepth > 1 && reductionTileSize * pipelineDepth > k()) { - llvm::errs() << "pipeline depth " << pipelineDepth - << " too large for reduction tile size " << reductionTileSize - << " given k " << k(); - return failure(); - } - - bool oneOption = - (useMmaSync ^ useWmma ^ useFma) && !(useMmaSync && useWmma && useFma); - if (!oneOption) { - llvm::errs() << "at most one of useMmaSync, useWmma, useFma can be true"; - return failure(); - } - - if (useMmaSync) { - if (blockTileM() < kMinMmaSyncMinM) { - llvm::errs() << "mma.sync requires at least " << kMinMmaSyncMinM - << " block tile size in M"; - return failure(); - } - 
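The thread/warp bookkeeping in initDefaultValues() and the consistency rule in validate() reduce to simple integer arithmetic. A worked example for the default numThreads = {64, 2, 1} and a subgroup (warp) size of 32 (illustration only):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t subgroupSize = 32;
  int64_t numThreads[3] = {64, 2, 1};
  // numWarps is derived per dimension; only dim 0 is divided by the subgroup.
  int64_t numWarps[3] = {(numThreads[0] + subgroupSize - 1) / subgroupSize,
                         numThreads[1], numThreads[2]};
  assert(numWarps[0] == 2 && numWarps[1] == 2 && numWarps[2] == 1);
  // validate(): total threads must equal total warps * subgroup size.
  int64_t totalThreads = numThreads[0] * numThreads[1] * numThreads[2]; // 128
  int64_t totalWarps = numWarps[0] * numWarps[1] * numWarps[2];         // 4
  assert(totalThreads == totalWarps * subgroupSize);
  // Tensor-core path: reductionTileSize starts at 16 and is scaled by
  // 32 / maxInputBitWidth, e.g. f16 inputs give 16 * (32 / 16) = 32.
  int64_t reductionTileSize = 16 * (32 / 16);
  assert(reductionTileSize == 32);
}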
if (blockTileN() < kMinMmaSyncMinN) { - llvm::errs() << "mma.sync requires at least " << kMinMmaSyncMinN - << " block tile size in N"; - return failure(); - } - if (reductionTileSize < kMinMmaSyncMinK) { - llvm::errs() << "mma.sync requires at least " << kMinMmaSyncMinK - << " block tile size in K"; - return failure(); - } - if (pipelineDepth > 1 && pipelineDepth < kMinMmaSyncPipelineDepth) { - llvm::errs() << "mma.sync pipelining requires at least " - << kMinMmaSyncPipelineDepth << " stages"; - return failure(); - } - if (pipelineDepth > 1 && reductionTileSize * kMinMmaSyncGroups > k()) { - llvm::errs() << "mma.sync pipelining requires at least " - << kMinMmaSyncGroups << " k groups"; - return failure(); - } - } else if (useWmma) { - if (blockTileM() < kMinWmmaMinM) { - llvm::errs() << "wmma requires at least " << kMinWmmaMinM - << " block tile size in M"; - return failure(); - } - if (blockTileN() < kMinWmmaMinN) { - llvm::errs() << "wmma requires at least " << kMinWmmaMinN - << " block tile size in N"; - return failure(); - } - if (reductionTileSize < kMinWmmaMinK) { - llvm::errs() << "wmma requires at least " << kMinWmmaMinK - << " block tile size in K"; - return failure(); - } - } - return success(); -} - -//===--------------------------------------------------------------------===// -// Strategy printing for debugging. -//===--------------------------------------------------------------------===// - -LLVM_DUMP_METHOD void AbstractGemmLikeStrategy::dump() const { - print(llvm::errs()); -} - -void AbstractGemmLikeStrategy::print(llvm::raw_ostream &os) const { - os << "- forced by CLI specification: " - << (cliOptionsSpecified ? "true" : "false") << "\n"; - os << "- block tile sizes: {"; - bool isFirst = true; - for (int64_t blockTileSize : blockTileSizes) { - if (!isFirst) - os << ", "; - os << blockTileSize; - isFirst = false; - } - os << "}\n"; - os << "- reduction tile size: " << reductionTileSize << '\n'; - - os << "- number of threads: {"; - isFirst = true; - for (int64_t numThreadsForDim : numThreads) { - if (!isFirst) - os << ", "; - os << numThreadsForDim; - isFirst = false; - } - os << "}\n"; - - os << "- number of warps: {"; - isFirst = true; - for (int64_t numWarpsForDim : numWarps) { - if (!isFirst) - os << ", "; - os << numWarpsForDim; - isFirst = false; - } - os << "}\n"; - os << "- use async copies: " << useAsyncCopies << '\n'; - os << "- use fma: " << useFma << '\n'; - os << "- use wmma: " << useWmma << '\n'; - os << "- use mma sync: " << useMmaSync << '\n'; - os << "- pipeline depth: " << pipelineDepth << '\n'; - - os << "\n-- Derived quantities --\n"; - os << "- lhs copy:\n"; - lhsCopyMapping().print(os << " -> "); - os << "\n- rhs copy:\n"; - rhsCopyMapping().print(os << " -> "); - os << "\n- res copy:\n"; - resCopyMapping().print(os << " -> "); - os << "\n"; -} - -/// Validates the mapping and emits a diagnostic on failure. 
-LogicalResult AbstractGemmLikeStrategy::validateCopyMapping( - MLIRContext *ctx, const MappingInfo &mapping, StringRef name) const { - int64_t threadsUsed = - std::accumulate(mapping.numThreads.begin(), mapping.numThreads.end(), 1, - std::multiplies()); - if (totalNumThreads() < threadsUsed) { - InFlightDiagnostic diag = emitError(UnknownLoc::get(ctx)) - << "too many threads used for transferring " - << name; - - std::string str; - llvm::raw_string_ostream os(str); - llvm::interleave(mapping.numThreads, os, " * "); - os << " >= " << totalNumThreads(); - diag.attachNote() << os.str(); - return diag; - } - - return success(); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h deleted file mode 100644 index ed7af71bb93b..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_ABSTRACT_GEMM_LIKE_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_ABSTRACT_GEMM_LIKE_STRATEGY_H_ - -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" - -namespace mlir::iree_compiler::gpu { - -struct AbstractGemmLikeStrategy : GPUStrategy { - AbstractGemmLikeStrategy(const GPUModel &gpuModel) : GPUStrategy(gpuModel) {} - - virtual ~AbstractGemmLikeStrategy(); - - //===--------------------------------------------------------------------===// - // Helpers and parameters for configuring the strategy. - //===--------------------------------------------------------------------===// - - /// Initialize values from the CLI. Set cliOptionsSpecified to true if the - /// default CLI values have been overriden. - virtual void initDefaultValues(const GPUModel &gpuModel); - - /// Encodes whether the user has specified any CLI options. When true, the - /// strategy should just run what was specified and is not allowed to - /// override the user's choices. - bool cliOptionsSpecified = false; - - /// Non-default subgroup size to use configured based on hardware supported - /// values. - std::optional targetSubgroupSize = std::nullopt; - - int64_t getSubgroupSize() const { - return targetSubgroupSize ? *targetSubgroupSize : subgroupSize; - } - - //===--------------------------------------------------------------------===// - // Parameters that control the tiling and mapping. - //===--------------------------------------------------------------------===// - - /// Tile sizes for the workgroup / determines grid size for all known - /// reduction strategies. The initial values are set by initDefaultValues(); - SmallVector blockTileSizes; - int64_t reductionTileSize; - SmallVector numThreads; - SmallVector numWarps; - virtual int64_t blockTileM() const = 0; - virtual int64_t blockTileN() const = 0; - - virtual int64_t numWarpsX() const = 0; - virtual int64_t numWarpsY() const = 0; - - virtual MappingInfo getBlockMapping() const = 0; - - /// Common values based on derived quantities. 
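The copy-mapping check above reduces to: the product of the per-dimension thread counts used by a copy must not exceed the block's total thread count. Standalone illustration with made-up sizes:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<int64_t> copyNumThreads = {32, 4}; // threads used by one copy
  int64_t threadsUsed =
      std::accumulate(copyNumThreads.begin(), copyNumThreads.end(),
                      int64_t(1), std::multiplies<int64_t>());
  int64_t totalNumThreads = 128; // e.g. block numThreads = {64, 2, 1}
  // validateCopyMapping fails when threadsUsed exceeds totalNumThreads.
  assert(threadsUsed <= totalNumThreads && "too many threads for the copy");
}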
- int64_t totalNumThreads() const { - int64_t res = 1; - for (auto v : numThreads) - res *= v; - return res; - } - - int64_t totalNumWarps() const { - int64_t res = 1; - for (auto v : numWarps) - res *= v; - return res; - } - - //===--------------------------------------------------------------------===// - // Parameters that control copy/padding transfers from global to shared. - //===--------------------------------------------------------------------===// - SmallVector paddingValueTypes; - SmallVector paddingDimensions; - SmallVector packingDimensions; - - ArrayAttr getZeroPadAttrFromElementalTypes(OpBuilder &b) const; - - virtual Type getLhsElementalType() const = 0; - virtual Type getRhsElementalType() const = 0; - virtual Type getResElementalType() const = 0; - - int64_t lhsElementalBitWidth() const { - return getLhsElementalType().getIntOrFloatBitWidth(); - } - int64_t rhsElementalBitWidth() const { - return getRhsElementalType().getIntOrFloatBitWidth(); - } - int64_t resElementalBitWidth() const { - return getResElementalType().getIntOrFloatBitWidth(); - } - - bool alignedLhs() const { - return m() % blockTileM() == 0 && k() % reductionTileSize == 0; - } - bool alignedRhs() const { - return n() % blockTileN() == 0 && k() % reductionTileSize == 0; - } - bool alignedRes() const { - return m() % blockTileM() == 0 && n() % blockTileN() == 0; - } - - virtual MappingInfo lhsCopyMapping() const = 0; - virtual LogicalResult validateLhsCopyMapping() const = 0; - virtual MappingInfo rhsCopyMapping() const = 0; - virtual LogicalResult validateRhsCopyMapping() const = 0; - virtual MappingInfo resCopyMapping() const = 0; - virtual LogicalResult validateResCopyMapping() const = 0; - - /// Validates the mapping and emits a diagnostic on failure. - LogicalResult validateCopyMapping(MLIRContext *ctx, - const MappingInfo &mapping, - StringRef name) const; - - //===--------------------------------------------------------------------===// - // Parameters that control compute mapping decisions. - //===--------------------------------------------------------------------===// - bool useAsyncCopies; - bool useMmaSync; - bool useWmma; - bool useFma; - int64_t pipelineDepth; - bool peelPipelineEpilogue; - virtual MappingInfo computeMapping() const = 0; - - virtual LogicalResult validate(const GPUModel &gpuModel) const; - - //===--------------------------------------------------------------------===// - // Problem-related quantities. - //===--------------------------------------------------------------------===// - virtual int64_t m() const = 0; - virtual int64_t n() const = 0; - virtual int64_t k() const = 0; - - virtual void print(llvm::raw_ostream &os) const = 0; - virtual LLVM_DUMP_METHOD void dump() const = 0; - - //===--------------------------------------------------------------------===// - // Preconditions of internal transforms lifted to the top-level for more - // actionnable error messages. In the fullness of time, transforms should - // expose preconditions and we should aggregate them automatically. - //===--------------------------------------------------------------------===// - - // TODO: To handle different element types efficiently, it would be much - // better to expose the unrolling to native size explicitly to the transforms - // rather than hide it behind an opaque transform. - - // wmma preconditions that we want to lift out in an actionnable top-level - // error message instead of failing late in the transformation schedule. 
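Worked example of the alignedLhs/alignedRhs/alignedRes predicates above: a 512x1024x256 (MxNxK) matmul with 128x128 block tiles and a reduction tile of 16 is aligned on every operand, so the aligned copy path applies (illustration only):

#include <cassert>
#include <cstdint>

int main() {
  int64_t m = 512, n = 1024, k = 256;
  int64_t blockTileM = 128, blockTileN = 128, reductionTileSize = 16;
  bool alignedLhs = (m % blockTileM == 0) && (k % reductionTileSize == 0);
  bool alignedRhs = (n % blockTileN == 0) && (k % reductionTileSize == 0);
  bool alignedRes = (m % blockTileM == 0) && (n % blockTileN == 0);
  assert(alignedLhs && alignedRhs && alignedRes);
  // A size such as m = 500 would fail alignedLhs/alignedRes and take the
  // unaligned path (masked vectorization of the distributed copies).
}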
- // TODO: These are now hardcoded for f32 but are element-type dependent. - // Precondition: the pipeline transformation for wmma requires at least 2 - // k-groups. - constexpr static int64_t kMinWmmaMinM = 16; - constexpr static int64_t kMinWmmaMinN = 16; - constexpr static int64_t kMinWmmaMinK = 8; - - // mma.sync preconditions that we want to lift out in an actionnable top-level - // error message instead of failing late in the transformation schedule. - // TODO: These are now hardcoded for f32 but are element-type dependent. - // Precondition: the pipeline transformation for mma.sync requires at least 2 - // k-groups. - constexpr static int64_t kMinMmaSyncGroups = 2; - // Precondition: the pipeline transformation for mma.sync requires at least a - // pipeline depth of 3. - constexpr static int64_t kMinMmaSyncPipelineDepth = 3; - // Precondition: if mma.sync is used, the tile sizes must be at least 8x8x4. - constexpr static int64_t kMinMmaSyncMinM = 8; - constexpr static int64_t kMinMmaSyncMinN = 8; - constexpr static int64_t kMinMmaSyncMinK = 4; -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_ABSTRACT_GEMM_LIKE_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel deleted file mode 100644 index 33ea8e9894ed..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2023 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:build_defs.oss.bzl", "iree_compiler_cc_library") - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_compiler_cc_library( - name = "GPU", - srcs = [ - "AbstractGemmLikeStrategy.cpp", - "Common.cpp", - "ConvolutionImplicitGemmStrategy.cpp", - "CopyMapping.cpp", - "MappingInfo.cpp", - "MatmulTensorCoreStrategy.cpp", - "PadStrategy.cpp", - "SmallReductionStrategy.cpp", - "StagedReductionStrategy.cpp", - "Strategies.cpp", - ], - hdrs = [ - "AbstractGemmLikeStrategy.h", - "Common.h", - "ConvolutionImplicitGemmStrategy.h", - "CopyMapping.h", - "MappingInfo.h", - "MatmulTensorCoreStrategy.h", - "PadStrategy.h", - "SmallReductionStrategy.h", - "StagedReductionStrategy.h", - "Strategies.h", - ], - deps = [ - # Dialects - "//compiler/src/iree/compiler/Dialect/LinalgExt/IR", - "//compiler/src/iree/compiler/Dialect/LinalgExt/TransformExtensions:LinalgExtExtensions", - "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineUtils", - "@llvm-project//mlir:AsyncDialect", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:ArithUtils", - "@llvm-project//mlir:BufferizationDialect", - "@llvm-project//mlir:BufferizationTransforms", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:FunctionInterfaces", - "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:LinalgDialect", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:MemRefTransformOps", - "@llvm-project//mlir:NVGPUDialect", - "@llvm-project//mlir:PDLDialect", - "@llvm-project//mlir:PDLInterpDialect", - "@llvm-project//mlir:SCFDialect", - 
"@llvm-project//mlir:SCFTransformOps", - "@llvm-project//mlir:SCFUtils", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TensorTransformOps", - "@llvm-project//mlir:TransformDialect", - "@llvm-project//mlir:VectorDialect", - "@llvm-project//mlir:VectorTransformOps", - # IR - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Rewrite", - # Other Stuff - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:DialectUtils", - # TransformStrategies - "//compiler/src/iree/compiler/Codegen/TransformStrategies/Common:TransformStrategies", - # TransformExtensions - "//compiler/src/iree/compiler/Codegen/Common/TransformExtensions:CommonExtensions", - "//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions", - "@llvm-project//mlir:LinalgTransformOps", - # TransformMatchers and other stuff - "//llvm-external-projects/iree-dialects:IREEDialectsTransforms", - ], -) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt deleted file mode 100644 index 48b44f3ef353..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt +++ /dev/null @@ -1,82 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
# -################################################################################ - -iree_add_all_subdirs() - -iree_cc_library( - NAME - GPU - HDRS - "AbstractGemmLikeStrategy.h" - "Common.h" - "ConvolutionImplicitGemmStrategy.h" - "CopyMapping.h" - "MappingInfo.h" - "MatmulTensorCoreStrategy.h" - "PadStrategy.h" - "SmallReductionStrategy.h" - "StagedReductionStrategy.h" - "Strategies.h" - SRCS - "AbstractGemmLikeStrategy.cpp" - "Common.cpp" - "ConvolutionImplicitGemmStrategy.cpp" - "CopyMapping.cpp" - "MappingInfo.cpp" - "MatmulTensorCoreStrategy.cpp" - "PadStrategy.cpp" - "SmallReductionStrategy.cpp" - "StagedReductionStrategy.cpp" - "Strategies.cpp" - DEPS - IREEDialectsTransforms - IREELinalgTransformDialect - LLVMSupport - MLIRAffineDialect - MLIRAffineUtils - MLIRAnalysis - MLIRArithDialect - MLIRArithUtils - MLIRAsyncDialect - MLIRBufferizationDialect - MLIRBufferizationTransforms - MLIRFuncDialect - MLIRFunctionInterfaces - MLIRGPUDialect - MLIRIR - MLIRLLVMDialect - MLIRLinalgDialect - MLIRLinalgTransformOps - MLIRMemRefDialect - MLIRMemRefTransformOps - MLIRNVGPUDialect - MLIRPDLDialect - MLIRPDLInterpDialect - MLIRParser - MLIRPass - MLIRRewrite - MLIRSCFDialect - MLIRSCFTransformOps - MLIRSCFUtils - MLIRSupport - MLIRTensorDialect - MLIRTensorTransformOps - MLIRTransformDialect - MLIRVectorDialect - MLIRVectorTransformOps - iree::compiler::Codegen::Common::TransformExtensions::CommonExtensions - iree::compiler::Codegen::LLVMGPU::TransformExtensions::LLVMGPUExtensions - iree::compiler::Codegen::TransformStrategies::Common::TransformStrategies - iree::compiler::Dialect::LinalgExt::IR - iree::compiler::Dialect::LinalgExt::TransformExtensions::LinalgExtExtensions - PUBLIC -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp deleted file mode 100644 index 30322269d80d..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp +++ /dev/null @@ -1,697 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" - -#include - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h" -#include "llvm/Support/ErrorHandling.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" -#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" -#include "mlir/IR/TypeUtilities.h" - -using namespace mlir; - -// TODO: significantly better namespacing. -using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using iree_compiler::IREE::transform_dialect::MapNestedForallToGpuThreadsOp; -using iree_compiler::IREE::transform_dialect::VectorToWarpExecuteOnLane0Op; -using iree_compiler::IREE::transform_dialect::VectorWarpDistributionOp; - -using iree_compiler::buildReductionStrategyBlockDistribution; -using iree_compiler::buildTileFuseDistToForallWithNumThreads; -using iree_compiler::buildTileFuseDistToForallWithTileSizes; -using iree_compiler::maxDivisorOfValueBelowLimit; -using iree_compiler::TileToForallAndFuseAndDistributeResult; -using iree_compiler::gpu::AbstractGemmLikeStrategy; -using iree_compiler::gpu::build1DSplittingStrategyWithOptionalThreadMapping; -using iree_compiler::gpu::buildCommonTrailingStrategy; -using iree_compiler::gpu::buildMapToBlockAndThreads; -using iree_compiler::gpu::GPUModel; -using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; -using iree_compiler::IREE::transform_dialect::IREEBufferizeOp; -using iree_compiler::IREE::transform_dialect::IREEEliminateEmptyTensorsOp; -using iree_compiler::IREE::transform_dialect::ShareForallOperandsOp; -using iree_compiler::IREE::transform_dialect::SynchronizeLoopOp; -using transform::FuseIntoContainingOp; -using transform::MatchOp; -using transform::MemRefEraseDeadAllocAndStoresOp; -using transform::RewriteInDestinationPassingStyleOp; -using transform::ScalarizeOp; -using transform::SequenceOp; - -//===----------------------------------------------------------------------===// -// General helpers. -//===----------------------------------------------------------------------===// - -/// Return max(1, (value * 32) / bitwidth). 
-int64_t mlir::iree_compiler::gpu::scaleUpByBitWidth(int64_t value, - int64_t bitWidth) { - assert((bitWidth & (bitWidth - 1)) == 0 && "bitWidth must be a power of 2"); - return std::max((value * 32) / bitWidth, int64_t(1)); -} - -/// Adjust the number of warps to use to benefit from packing multiple smaller -/// elemental types within a single 128 bit shuffled element. -int64_t mlir::iree_compiler::gpu::adjustNumberOfWarpsForBlockShuffle( - int64_t numWarpsToUse, int64_t bitWidth) { - // Try to scale down the number of warps to use 32b elements in warp shuffles. - assert(((bitWidth & (bitWidth - 1)) == 0) && "bitWidth must be a power of 2"); - int64_t factor; - for (factor = scaleUpByBitWidth(1, bitWidth); factor > 1; factor >>= 1) - if (numWarpsToUse % factor == 0) - break; - numWarpsToUse /= factor; - // Try to scale to using 128b elements in warp shuffles. - return std::max(numWarpsToUse / 4, int64_t(1)); -} - -/// Compute the (splitPoint, vectorSize) pair to break [0 .. upperBound] into -/// [0 .. splitPoint] and [splitPoint + 1 .. upperBound] such that `splitPoint` -/// is a multiple of `fixedSize * vectorSize`. -/// The returned `vectorSize` is the maximal power of `2`, smaller than -/// `maxVectorSize`, for which `splitPoint` can be computed. -/// -/// Note: `vectorSize` may be smaller than `maxVectorSize` when the upperBound -/// is small enough. In such cases we give preference to keeping the `fixedSize` -/// parameter unchanged and reducing the `vectorSize`. `fixedSize` generally -/// captures the number of threads and we do not alter decisions on parallelism -/// at this level. -/// -/// If such a positive multiple exists: -/// 1. if it is `upperBound`, then `upperBound` is an even multiple of -/// `fixedSize` * `vectorSize` and we can tile evenly without splitting. -/// In this case we return (0, vectorSize). -/// 2. otherwise, it is a split point at which we can split with vectorSize -/// to obtain the largest divisible tiling. -/// In this case we return (splitPoint, vectorSize). -/// Otherwise we return (0, 1) to signify no splitting and a vector size of 1. -// TODO: support the dynamic case, taking future stride and alignment into -// account and returning Values. The op then needs to become part of the -// transform dialect. -static std::pair computeSplitPoint(int64_t upperBound, - int64_t fixedSize, - int64_t maxVectorSize) { - assert((maxVectorSize & (maxVectorSize - 1)) == 0 && "must be a power of 2"); - if (ShapedType::isDynamic(upperBound)) { - return std::make_pair(int64_t(0), int64_t(1)); - } - for (int64_t vectorSize = maxVectorSize; vectorSize >= 1; vectorSize >>= 1) { - int64_t splitPoint = - iree_compiler::previousMultipleOf(upperBound, fixedSize * vectorSize); - if (splitPoint > 0) { - return (upperBound == splitPoint) - ? std::make_pair(int64_t(0), vectorSize) - : std::make_pair(splitPoint, vectorSize); - } - } - return std::make_pair(int64_t(0), int64_t(1)); -} - -//===----------------------------------------------------------------------===// -// Low-level reusable retargetable builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -/// Post-bufferization mapping to blocks and threads. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. 
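The computeSplitPoint helper above can be exercised in isolation. A standalone restatement for static shapes only, with worked values (the `Ref` name is hypothetical; previousMultipleOf is inlined as floordiv * multiple):

#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<int64_t, int64_t>
computeSplitPointRef(int64_t upperBound, int64_t fixedSize,
                     int64_t maxVectorSize) {
  for (int64_t vectorSize = maxVectorSize; vectorSize >= 1; vectorSize >>= 1) {
    int64_t chunk = fixedSize * vectorSize;
    int64_t splitPoint = (upperBound / chunk) * chunk;
    if (splitPoint > 0)
      return (splitPoint == upperBound)
                 ? std::make_pair(int64_t(0), vectorSize) // evenly divisible
                 : std::make_pair(splitPoint, vectorSize);
  }
  return {0, 1}; // no splitting, scalar fallback
}

int main() {
  // 128 elements, 32 threads: evenly divisible with vector size 4.
  assert(computeSplitPointRef(128, 32, 4) ==
         std::make_pair(int64_t(0), int64_t(4)));
  // 100 elements, 32 threads: split at 64 with vector size 2, tail of 36.
  assert(computeSplitPointRef(100, 32, 4) ==
         std::make_pair(int64_t(64), int64_t(2)));
  // Nothing fits: no split, vector size 1.
  assert(computeSplitPointRef(7, 32, 4) ==
         std::make_pair(int64_t(0), int64_t(1)));
}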
-Value mlir::iree_compiler::gpu::buildMapToBlockAndThreads( - ImplicitLocOpBuilder &b, Value funcH, ArrayRef blockSize, - std::optional subgroupSize) { - b.create(funcH); - auto mapToThreadsOp = - b.create(funcH, blockSize); - if (subgroupSize) - mapToThreadsOp.setSubgroupSize(*subgroupSize); - return funcH; -} - -/// Post-bufferization vector distribution with rank-reduction. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -Value mlir::iree_compiler::gpu::buildDistributeVectors(ImplicitLocOpBuilder &b, - Value variantH, - Value funcH, - int64_t warpSize) { - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - }); - Value ifH = b.create(funcH, scf::IfOp::getOperationName()); - // Locally suppress failures for this op only because it doesn't cover the - // `threadIdx.x == 0 && threadIdx.y == 0` case at the moment. - auto sequence = b.create( - TypeRange(), transform::FailurePropagationMode::Suppress, variantH, - /*extraBindings=*/ValueRange()); - { - OpBuilder::InsertionGuard guard(b); - b.createBlock(&sequence.getBody(), sequence.getBody().begin(), - transform::AnyOpType::get(b.getContext()), b.getLoc()); - ifH = b.create(ifH, warpSize); - b.create(); - } - b.create(funcH); - return funcH; -} - -//===----------------------------------------------------------------------===// -// Mid-level problem-specific strategy builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -void mlir::iree_compiler::gpu:: - build1DSplittingStrategyWithOptionalThreadMapping( - ImplicitLocOpBuilder &b, Value variantH, Value opH, int64_t rank, - int64_t mostMinorDim, SmallVector opSizes, int64_t numThreads, - Attribute mappingAttr, int64_t maxVectorSize) { - // Poor man's handling of optionality in C++. Will need to be converted to - // proper transform dialect filters or handling of emptiness. - if (rank == 0) - return; - - // Compute split point to guarantee we form a maximal chunk divisible by - // numThreads * vectorSize. - // This chunk is currently not aligned for proper vector accesses. - // In the future, this can be solved either by: - // 1. doing an extra prologue split that is cognizant of the future stride. - // 2. or, aligning allocations to a multiple of 128b on the most minor - // dimensions but without changing problem sizes (i.e. poor man's - // packing). - int64_t mostMinorSize = opSizes[mostMinorDim]; - auto [splitPoint, vectorSize] = computeSplitPoint( - /*upperBound=*/mostMinorSize, /*fixedSize=*/numThreads, - /*maxVectorSize=*/maxVectorSize); - - // Create 1-D tile sizes for the first, divisible, part. - SmallVector scfForTileSizes(rank, 0), foreachTileSizes(rank, 0); - scfForTileSizes[mostMinorDim] = numThreads * vectorSize; - foreachTileSizes[mostMinorDim] = numThreads; - - // Split, tile and map the most minor dimension to `mappingAttr`. - if (splitPoint > 0) { - auto anyOpType = transform::AnyOpType::get(b.getContext()); - auto split = b.create( - anyOpType, anyOpType, opH, mostMinorDim, Value(), splitPoint); - opH = split.getFirst(); - if (vectorSize > 1) { - auto res = iree_compiler::buildTileFuseToScfFor( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/opH, - /*opsHToFuse=*/{}, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr({scfForTileSizes}))); - opH = res.tiledOpH; - // Reset the vector size to 1 for the tail, which is known to not be - // divisible by `numThreads * vectorSize`. 
- vectorSize = 1; - } - if (numThreads > 1) { - assert(mappingAttr && "must specify a mapping attribute"); - iree_compiler::buildTileFuseDistToForallWithNumThreads( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/opH, - /*opsHToFuse=*/{}, - /*numThreads=*/getAsOpFoldResult(b.getI64ArrayAttr(foreachTileSizes)), - /*threadDimMapping=*/b.getArrayAttr({mappingAttr})); - } - opH = split.getSecond(); - } - - // Tile and map the most minor dimension of the remainder to mappingAttr. - if (vectorSize > 1) { - auto res = iree_compiler::buildTileFuseToScfFor( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/opH, - /*opsHToFuse=*/{}, - /*tileSizes=*/getAsOpFoldResult(b.getI64ArrayAttr({scfForTileSizes}))); - opH = res.tiledOpH; - } - if (numThreads > 1) { - assert(mappingAttr && "must specify a mapping attribute"); - iree_compiler::buildTileFuseDistToForallWithNumThreads( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/opH, - /*opsHToFuse=*/{}, - /*numThreads=*/getAsOpFoldResult(b.getI64ArrayAttr(foreachTileSizes)), - /*threadDimMapping=*/b.getArrayAttr({mappingAttr})); - } -} - -/// Take care of the last common steps in a GPU strategy (i.e. vectorize, -/// bufferize, maps to blocks and threads and distribute vectors). -/// Return the handles to the updated variant and the func::FuncOp ops under -/// the variant op. -std::pair mlir::iree_compiler::gpu::buildCommonTrailingStrategy( - ImplicitLocOpBuilder &b, Value variantH, - ArrayRef numThreadsInBlock) { - Value funcH = b.create(variantH, func::FuncOp::getOperationName()); - - // Step N-5. Fold tensor.empty to avoid large allocations. - // Step N-4. Perform a pass of canonicalization + enabling after tiling. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - funcH = iree_compiler::buildVectorize(b, funcH); - - // Step N-3. Perform a pass of canonicalization + enabling after vectorization - // as well as hoisting subset operations such as vector.transfer_read/write. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - iree_compiler::buildHoisting(b, funcH); - - // Step N-2. Bufferize and drop HAL descriptor from memref ops. - variantH = iree_compiler::buildBufferize(b, variantH, /*targetGpu=*/true); - - // Step N-1. Post-bufferization mapping to blocks and threads. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single func::FuncOp to transform, may need hardening. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - funcH = buildMapToBlockAndThreads(b, funcH, numThreadsInBlock); - - // Step N. Perform a final pass of canonicalization + enabling before - // returning. - mlir::iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - return std::make_pair(variantH, funcH); -} - -//===----------------------------------------------------------------------===// -// Subset of mid-level builders currently used for GEMM-like problems. -//===----------------------------------------------------------------------===// - -/// Build transform IR to hoist the padded output operand of a padded matmul. -/// Additionally, this attempts to fold the padding into the producing fill, if -/// available. 
-Value mlir::iree_compiler::gpu::buildHoistOutputPaddingOp( - ImplicitLocOpBuilder &b, Value variantH, Value paddedMatmulOpH, - int64_t numLoopsToHoist) { - // Find the output pad and hoist it. - // TODO: don't hardcode output operand number. - // TODO: Better builders. - Value outputH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(2)); - - // Hoist the padding above the 1 innermost reduction loop. - auto padOpType = transform::OperationType::get( - b.getContext(), tensor::PadOp::getOperationName()); - outputH = b.create(padOpType, outputH); - b.create(paddedMatmulOpH.getType(), outputH, - numLoopsToHoist); - - // Perform a pass of canonicalization cleanups + folding fill + pad into pad - // by applying `foldTensorSubsets` and `tilingCanonicalization`. - { - Value funcH = b.create( - variantH, func::FuncOp::getOperationName()); - iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create< - transform::ApplyMergeConsecutiveInsertExtractSlicePatternsOp>( - loc); - }); - } - - // The canonicalization above should have rewritten hoistPad into a FillOp. - // Unfortunately, the listener drops handles if the op types don't match. We - // need better behavior here, for now we rematch. - // TODO: use value handles. - Value fillOpH = b.create( - variantH, linalg::FillOp::getOperationName()); - - return fillOpH; -} - -/// Helper function to distribute one pad or copy operation. -/// Note: When `foldIfBranch` is true, one must later perform masked -/// vectorization of the result. -/// This amounts to injecting knowledge about future transformations without -/// adding leaky semantics. -std::tuple -mlir::iree_compiler::gpu::buildDistributeOnePadOrCopyWithTileSizes( - ImplicitLocOpBuilder &b, Value variantH, Value copyOpH, - ArrayRef tileSizes, ArrayRef threadDimMapping, - bool foldIfBranch) { - TileToForallAndFuseAndDistributeResult res = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/copyOpH, - /*opsToFuseH=*/{}, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(tileSizes)), - /*threadDimMapping=*/ - b.getArrayAttr(threadDimMapping)); - if (foldIfBranch) { - Value ifOpH = b.create(res.forallH, - scf::IfOp::getOperationName()); - b.create( - ifOpH, /*takeElseBranch=*/b.getUnitAttr()); - } - return std::make_tuple(res.tiledOpH, res.forallH); -} - -/// Helper function to distribute one pad or copy operation. -/// Note: When `foldIfBranch` is true, one must later perform masked -/// vectorization of the result. -/// This amounts to injecting knowledge about future transformations without -/// adding leaky semantics. -Value mlir::iree_compiler::gpu::buildDistributeOnePadOrCopyWithNumThreads( - ImplicitLocOpBuilder &b, Value variantH, Value copyOpH, - ArrayRef numThreads, ArrayRef threadDimMapping, - bool foldIfBranch) { - TileToForallAndFuseAndDistributeResult res = - buildTileFuseDistToForallWithNumThreads( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/copyOpH, - /*opsToFuseH=*/{}, - /*numThreads=*/ - getAsOpFoldResult(b.getI64ArrayAttr(numThreads)), - /*threadDimMapping=*/ - b.getArrayAttr(threadDimMapping)); - if (foldIfBranch) { - Value ifOpH = b.create(res.forallH, - scf::IfOp::getOperationName()); - b.create( - ifOpH, /*takeElseBranch=*/b.getUnitAttr()); - } - return res.tiledOpH; -} - -/// Distribute the explicit copies involved in a matmul operation -/// `paddedMatmulOpH`. 
-std::tuple -mlir::iree_compiler::gpu::buildDistributeMatmulCopies( - ImplicitLocOpBuilder &b, Value variantH, Value paddedMatmulOpH, - const AbstractGemmLikeStrategy &strategy) { - // Aligned vs unaligned handling deviates here by converting the pads to - // copies for the aligned case. - // TODO: Unify aligned and unaligned codegen. - Value copyBackOpH; - if (!strategy.alignedRes()) { - // Explicitly materialize the parent parallel_insert into a copy to avoid - // late bufferization interferences. - // TODO: Avoid brittle rematching. - Value insertSliceH = b.create( - variantH, tensor::ParallelInsertSliceOp::getOperationName()); - copyBackOpH = b.create( - insertSliceH.getType(), insertSliceH); - } else { - Value resH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(2)); - copyBackOpH = - b.create(resH.getType(), resH); - } - - Value lhsH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(0)); - Value rhsH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(1)); - - // Rewrite aligned pads as destination passing (linalg.copy) - if (strategy.alignedLhs() && strategy.packingDimensions[0]) - lhsH = b.create(lhsH.getType(), lhsH); - if (strategy.alignedRhs() && strategy.packingDimensions[1]) - rhsH = b.create(rhsH.getType(), rhsH); - - MappingInfo lhsCopyMapping = strategy.lhsCopyMapping(); - Value lhsCopyOpH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, lhsH, /*numThreads=*/lhsCopyMapping.numThreads, - /*threadDimMapping=*/lhsCopyMapping.threadMapping, - /*foldIfBranch=*/!strategy.alignedLhs()); - - MappingInfo rhsCopyMapping = strategy.rhsCopyMapping(); - Value rhsCopyOpH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, rhsH, /*numThreads=*/rhsCopyMapping.numThreads, - /*threadDimMapping=*/rhsCopyMapping.threadMapping, - /*foldIfBranch=*/!strategy.alignedRhs()); - - if (!strategy.alignedRes()) { - MappingInfo resCopyMapping = strategy.resCopyMapping(); - copyBackOpH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, copyBackOpH, - /*numThreads=*/resCopyMapping.numThreads, - /*threadDimMapping=*/resCopyMapping.threadMapping); - } - - return std::make_tuple(lhsCopyOpH, rhsCopyOpH, copyBackOpH); -} - -/// Specific pattern to perform masked vectorization of copies give as -/// parameters, cleanup and vectorize the rest. -// TODO: generalize and don't hardcode. -void mlir::iree_compiler::gpu::buildMatmulVectorization( - ImplicitLocOpBuilder &b, Value variantH, Value lhsCopyOpH, Value rhsCopyOpH, - Value copyBackOpH, const AbstractGemmLikeStrategy &strategy, - bool vectorizePadding, bool vectorizeNdExtract) { - // Canonicalize to make padOp outputs static shaped: this is currently a - // prerequisite for vector masking. - // Also, no canonicalization is allowed after vector masking and before we - // lower the masks: masks are currently quite brittle and do not like - // canonicalization or anything else that may insert an op in their region. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Apply vector masking. 
- if (!strategy.alignedLhs()) { - MappingInfo lhsCopyMapping = strategy.lhsCopyMapping(); - SmallVector scalableSizes(lhsCopyMapping.tileSizes.size(), false); - b.create(lhsCopyOpH, ValueRange(), - lhsCopyMapping.tileSizes, nullptr, - scalableSizes); - } - if (!strategy.alignedRhs()) { - MappingInfo rhsCopyMapping = strategy.rhsCopyMapping(); - SmallVector scalableSizes(rhsCopyMapping.tileSizes.size(), false); - b.create(rhsCopyOpH, ValueRange(), - rhsCopyMapping.tileSizes, nullptr, - scalableSizes); - } - if (!strategy.alignedRes()) { - MappingInfo resCopyMapping = strategy.resCopyMapping(); - SmallVector scalableSizes(resCopyMapping.tileSizes.size(), false); - b.create(copyBackOpH, ValueRange(), - resCopyMapping.tileSizes, nullptr, - scalableSizes); - } - - // Lower all masked vector transfers at this point, as they make - // canonicalization generate incorrect IR. - // TODO: don't rematch, apply on the variant op directly. - funcH = - b.create(variantH, func::FuncOp::getOperationName()); - buildLowerMaskedTransfersAndCleanup(b, funcH, /*cleanup=*/false); - - // Apply vectorization + cleanups to what remains. - funcH = iree_compiler::buildVectorize(b, funcH, /*applyCleanups=*/true, - vectorizePadding, vectorizeNdExtract); -} - -/// Build the transform IR to perform conversion to tensor core operations. -/// This is currently subject to phase orderings as follows: -/// - Vector transfer_read and transfer_write patterns have different subview -/// folding behavior, force a fold_memref_aliases on them to enable -/// redundant vector transfer hoisting. -/// - Unfortunately, fold_memref_aliases breaks vector_to_mma conversion -/// across scf.for after unrolling due to insert_strided_slice / -/// extract_strided_slice across iter_args boundaries. -/// - Hoist redundant vector transfers to allow conversion to tensor core to -/// proceed. We really don't want to do this after bufferization but we need -/// to atm. -Value mlir::iree_compiler::gpu::buildConvertToTensorCoreOp( - ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy) { - // TODO: Fewer canonicalization. - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - b.create(funcH); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - if (strategy.useWmma) { - b.create( - funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - }); - } else if (strategy.useMmaSync) { - b.create( - funcH, [&](OpBuilder &b, Location loc) { - b.create(loc); - }); - } /* else nothing to do for fma here */ - - Value forH = b.create( - transform::OperationType::get(b.getContext(), "scf.for"), funcH, - b.getStrArrayAttr({scf::ForOp::getOperationName()}), - /*matchInterfaceEnum=*/transform::MatchInterfaceEnumAttr(), - /*opAttrs=*/DictionaryAttr(), /*filterResultType=*/TypeAttr(), - /*filterOperandTYpes=*/ArrayAttr()); - // TODO: At this time, this synchronization is needed for applying the - // HoistRedundantVectorTransfersOp transform correctly. This is because the - // transform does not take parallelism into accound. - // In the future, HoistRedundantVectorTransfersOp + SynchronizeLoopOp need to - // be replaced by a single transform. - b.create(forH); - - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create(funcH); - // TODO: not a functional style transform and avoid returning funcH. 
- funcH = b.create( - transform::AnyOpType::get(b.getContext()), funcH); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - b.create(funcH); - - if (strategy.useWmma) { - auto vectorToMMaConversionOp = b.create< - iree_compiler::IREE::transform_dialect::VectorToMMAConversionOp>(funcH); - // TODO: proper builder instead of a setting post-hoc. - vectorToMMaConversionOp.setUseWmma(true); - } else if (strategy.useMmaSync) { - auto vectorToMMaConversionOp = b.create< - iree_compiler::IREE::transform_dialect::VectorToMMAConversionOp>(funcH); - // TODO: proper builder instead of a setting post-hoc. - vectorToMMaConversionOp.setUseMmaSync(true); - } /* else nothing to do for fma here */ - - // Post-hoc elimination of barriers. - funcH = b.create(funcH); - return funcH; -} - -void mlir::iree_compiler::gpu::buildMultiBuffering( - ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy) { - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - // TODO: Avoid brittle matching here. - // TODO: Better builder after integrate. - Value allocH = b.create( - transform::OperationType::get(b.getContext(), "memref.alloc"), funcH, - b.getStrArrayAttr({memref::AllocOp::getOperationName()}), - /*matchInterfaceEnum=*/transform::MatchInterfaceEnumAttr(), - /*opAttrs=*/DictionaryAttr(), /*filterResultType=*/TypeAttr(), - /*filterOperandTypes=*/ArrayAttr()); - // TODO: Better builder instead of setting post-hoc. - auto multiBufferOp = b.create( - transform::AnyOpType::get(b.getContext()), allocH); - multiBufferOp.setFactor(strategy.pipelineDepth); - multiBufferOp.setSkipAnalysis(true); -} - -Value mlir::iree_compiler::gpu::buildConvertToAsyncCopies( - ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy) { - b.create(funcH, [&](OpBuilder &b, Location loc) { - // Atm, vectors need to be lowered to 1-D for cp.async mapping to connect. - // TODO: not a functional style op to avoid invalidating artificially. - auto transferToScfOp = - b.create(loc); - // TODO: proper builder instead of a setting post-hoc. - transferToScfOp.setMaxTransferRank(1); - transferToScfOp.setFullUnroll(true); - }); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - auto createAsyncGroupOp = - b.create( - TypeRange{}, funcH); - if (strategy.useMmaSync) { - // TODO: proper builder instead of a setting post-hoc. - createAsyncGroupOp.setUseMmaSync(strategy.useMmaSync); - } - iree_compiler::buildCanonicalizationAndEnablingTransforms( - b, funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - return funcH; -} - -void mlir::iree_compiler::gpu::buildPipelineSharedMemoryCopies( - ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy) { - Value computeOpH; - if (strategy.useWmma) { - computeOpH = b.create( - funcH, mlir::gpu::SubgroupMmaComputeOp::getOperationName()); - } else if (strategy.useMmaSync) { - computeOpH = b.create( - funcH, mlir::nvgpu::MmaSyncOp::getOperationName()); - } else { - assert(strategy.useFma); - computeOpH = b.create( - funcH, mlir::vector::ContractionOp::getOperationName()); - } - // TODO: Better builder. - Value forOpH = b.create( - transform::AnyOpType::get(b.getContext()), computeOpH, - /*isolated_from_above=*/false, /*allow_empty_results=*/false, - /*op_name=*/b.getStringAttr("scf.for"), /*deduplicate=*/true); - // TODO: Better builder instead of setting post-hoc.
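The multi-buffering factor and the pipelining depth configured around here both come from strategy.pipelineDepth. As a rough illustration of what pipelining the shared memory copies buys (this sketch is not part of the diff and is independent of the transform ops), the schedule overlaps the copy for iteration i with the compute of iteration i - (depth - 1), cycling through depth buffer slots; a minimal standalone C++ sketch with made-up trip counts:

#include <cstdint>
#include <iostream>

// Illustrative only: shows the slot rotation that multi-buffering with a
// factor of `depth` enables; the trip count and depth below are hypothetical.
int main() {
  constexpr int64_t depth = 3;    // corresponds to strategy.pipelineDepth
  constexpr int64_t numIters = 8; // reduction loop trip count
  // Prologue: fill the first depth - 1 slots before any compute starts.
  for (int64_t i = 0; i < depth - 1; ++i)
    std::cout << "prologue: copy iter " << i << " -> slot " << i % depth << "\n";
  // Steady state: issue the copy for iter i while computing iter i-(depth-1).
  for (int64_t i = depth - 1; i < numIters; ++i)
    std::cout << "copy iter " << i << " -> slot " << i % depth
              << ", compute iter " << i - (depth - 1) << " from slot "
              << (i - (depth - 1)) % depth << "\n";
  // Epilogue: drain the remaining computes (emitted separately when the
  // epilogue is peeled).
  for (int64_t i = numIters; i < numIters + depth - 1; ++i)
    std::cout << "epilogue: compute iter " << i - (depth - 1) << " from slot "
              << (i - (depth - 1)) % depth << "\n";
}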
- auto pipelineOp = b.create< - iree_compiler::IREE::transform_dialect::PipelineSharedMemoryCopiesOp>( - transform::AnyOpType::get(b.getContext()), forOpH); - // TODO: depth from strategy, or directly from individual buffers. - pipelineOp.setDepth(strategy.pipelineDepth); - pipelineOp.setUseMmaSync(strategy.useMmaSync); - pipelineOp.setPeelEpilogue(strategy.peelPipelineEpilogue); -} - -Value mlir::iree_compiler::gpu::buildBufferize(ImplicitLocOpBuilder &b, - Value variantH) { - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - b.create( - funcH); - b.create(funcH); - b.create(funcH); - auto bufferizeOp = b.create(funcH, /*targetGpu=*/true); - bufferizeOp.setTargetGpu(true); - variantH = bufferizeOp.getResult(); - Value memrefFunc = - b.create(variantH, func::FuncOp::getOperationName()); - b.create(memrefFunc); - return variantH; -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h deleted file mode 100644 index 6d2934e0c342..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COMMON_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COMMON_H_ - -#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" -#include "llvm/ADT/StringRef.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/IR/BuiltinOps.h" - -namespace mlir::iree_compiler::gpu { - -struct GPUModel; - -//===----------------------------------------------------------------------===// -// Base quantities generally useful for all GPU strategies. -//===----------------------------------------------------------------------===// -inline Attribute threadX(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, mlir::gpu::MappingId::DimX); -} -inline Attribute threadY(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, mlir::gpu::MappingId::DimY); -} -inline Attribute threadZ(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, mlir::gpu::MappingId::DimZ); -} -inline Attribute warpX(MLIRContext *ctx) { - return mlir::gpu::GPUWarpMappingAttr::get(ctx, mlir::gpu::MappingId::DimX); -} -inline Attribute warpY(MLIRContext *ctx) { - return mlir::gpu::GPUWarpMappingAttr::get(ctx, mlir::gpu::MappingId::DimY); -} -inline Attribute warpZ(MLIRContext *ctx) { - return mlir::gpu::GPUWarpMappingAttr::get(ctx, mlir::gpu::MappingId::DimZ); -} -inline Attribute linearId0(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, - mlir::gpu::MappingId::LinearDim0); -} -inline Attribute linearId1(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, - mlir::gpu::MappingId::LinearDim1); -} -inline Attribute linearId2(MLIRContext *ctx) { - return mlir::gpu::GPUThreadMappingAttr::get(ctx, - mlir::gpu::MappingId::LinearDim2); -} - -//===----------------------------------------------------------------------===// -// General helpers. 
-//===----------------------------------------------------------------------===// -static constexpr int64_t kCudaMaxVectorLoadBitWidth = 128; - -/// Return max(1, (value * 32) / bitWidth). -int64_t scaleUpByBitWidth(int64_t value, int64_t bitWidth); - -/// Adjust the number of warps to use to benefit from packing multiple smaller -/// elemental types within a single 128 bit shuffled element. -int64_t adjustNumberOfWarpsForBlockShuffle(int64_t numWarpsToUse, - int64_t bitWidth); - -//===----------------------------------------------------------------------===// -// Low-level reusable retargetable builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -/// Post-bufferization mapping to blocks and threads. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -/// Takes an optional `subgroupSize` argument to specify the number of threads -/// per subgroup. -Value buildMapToBlockAndThreads( - ImplicitLocOpBuilder &b, Value funcH, ArrayRef blockSize, - std::optional subgroupSize = std::nullopt); - -/// Post-bufferization vector distribution with rank-reduction. -/// Takes a handle to a func.func and returns an updated handle to a -/// func.func. -Value buildDistributeVectors(ImplicitLocOpBuilder &b, Value variantH, - Value funcH, int64_t warpSize); - -/// Take care of the last common steps in a GPU strategy (i.e. vectorize, -/// bufferize, maps to blocks and threads and distribute vectors). -/// Return the handles to the updated variant and the function ops under -/// the variant op. -// TODO: abstract away AbstractReductionStrategy, this is supposed to be -// retargetable. -std::pair -buildCommonTrailingStrategy(ImplicitLocOpBuilder &b, Value variantH, - ArrayRef numThreadsInBlock); - -//===----------------------------------------------------------------------===// -// Mid-level problem-specific strategy builder APIs, follow MLIR-style builders. -//===----------------------------------------------------------------------===// -/// Take a handle `opH` to a Linalg op of rank `rank`, sizes `opSizes` and for -/// which we know the most minor dimension `mostMinorDim` (assuming all accesses -/// are contiguous along that dimension for now). -/// Build a schedule that maps `mostMinorDim` to a `scf.forall` op. -/// When `numThreads` > 1, the `scf.forall` is also mapped to -/// `mappingAttr` (which must then be non-null). -/// The constructed schedule first performs a split of the largest possible -/// multiple of `numThreads * maxVectorSize` to form a maximally divisible -/// region. -// TODO: More robustness wrt selecting the most minor dimension otherwise -// performance may suffer. -// TODO: Split point should be dynamic and aware of future stride / alignment -// to also guarantee proper vector alignments. OTOH this is a non-trivial bump -// in schedule complexity and can be handled with simple padding of the -// underlying allocation. -void build1DSplittingStrategyWithOptionalThreadMapping( - ImplicitLocOpBuilder &b, Value variantH, Value opH, int64_t rank, - int64_t mostMinorDim, SmallVector opSizes, int64_t numThreads, - Attribute mappingAttr = Attribute(), int64_t maxVectorSize = 4); - -/// Build transform IR to hoist the padded output operand of a padded matmul. -/// Additionally, this attempts to fold the padding into the producing fill, if -/// available. -// TODO: Generalize, this is not specific to a matmul. 
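For the 1-D splitting strategy documented a few lines above, the "maximally divisible region" is simply the largest multiple of numThreads * maxVectorSize that fits in the problem size; the remainder is tiled and mapped separately. A minimal standalone sketch (plain C++ rather than the transform-dialect builders; the sizes below are hypothetical):

#include <cassert>
#include <cstdint>
#include <iostream>

// Illustrative sketch of the split point described by
// build1DSplittingStrategyWithOptionalThreadMapping's documentation: the
// largest multiple of (numThreads * maxVectorSize) not exceeding `size`.
static int64_t computeSplitPoint(int64_t size, int64_t numThreads,
                                 int64_t maxVectorSize) {
  int64_t chunk = numThreads * maxVectorSize;
  assert(chunk > 0 && "expected a positive chunk size");
  return (size / chunk) * chunk; // may be 0 when size < chunk
}

int main() {
  // E.g. 1000 elements, 64 threads, vector width 4:
  // chunk = 256, split point = 768, remainder = 232 elements.
  int64_t size = 1000, numThreads = 64, maxVectorSize = 4;
  int64_t split = computeSplitPoint(size, numThreads, maxVectorSize);
  std::cout << "divisible region: [0, " << split << "), remainder: "
            << (size - split) << " elements\n";
}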
-// TODO: Better API -Value buildHoistOutputPaddingOp(ImplicitLocOpBuilder &b, Value variantH, - Value paddedMatmulOpH, - int64_t numLoopsToHoist = 1); - -/// Helper function to distribute one pad or copy operation with specified num -/// threads. -/// Note: When `foldIfBranch` is true, one must later perform masked -/// vectorization of the result. -/// This amounts to injecting knowledge about future transformations without -/// adding leaky semantics. -Value buildDistributeOnePadOrCopyWithNumThreads( - ImplicitLocOpBuilder &b, Value variantH, Value copyOpH, - ArrayRef numThreads, ArrayRef threadDimMapping, - bool foldIfBranch = false); - -/// Helper function to distribute one pad or copy operation with specified tile -/// sizes. -/// Note: When `foldIfBranch` is true, one must later perform masked -/// vectorization of the result. -/// This amounts to injecting knowledge about future transformations without -/// adding leaky semantics. -std::tuple buildDistributeOnePadOrCopyWithTileSizes( - ImplicitLocOpBuilder &b, Value variantH, Value copyOpH, - ArrayRef tileSizes, ArrayRef threadDimMapping, - bool foldIfBranch = false); - -/// Distribute the explicit copies involved in a matmul operation -/// `paddedMatmulOpH`. -std::tuple -buildDistributeMatmulCopies(ImplicitLocOpBuilder &b, Value variantH, - Value paddedMatmulOpH, - const AbstractGemmLikeStrategy &strategy); - -/// Specific pattern to perform masked vectorization of copies give as -/// parameters, cleanup and vectorize the rest. -void buildMatmulVectorization(ImplicitLocOpBuilder &b, Value variantH, - Value lhsCopyOpH, Value rhsCopyOpH, - Value copyBackOpH, - const AbstractGemmLikeStrategy &strategy, - bool vectorizePadding = false, - bool vectorizeNdExtract = false); - -/// Build the transform IR to perform conversion to tensor core operations. -/// This is currently subject to phase orderings as follows: -/// - Vector transfer_read and transfer_write patterns have different subview -/// folding behavior, force a fold_memref_aliases on them to enable -/// redundant vector transfer hoisting. -/// - Unfortunately, fold_memref_aliases breaks vector_to_mma conversion -/// across scf.for after unrolling due to insert_strided_slice / -/// extract_strided_slice across iter_args boundaries. -/// - Hoist redundant vector transfers to allow conversion to tensor core to -/// proceed. We really don't want to do this after bufferization but we need -/// to atm. 
-Value buildConvertToTensorCoreOp(ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy); - -void buildMultiBuffering(ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy); - -Value buildConvertToAsyncCopies(ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy); - -void buildPipelineSharedMemoryCopies(ImplicitLocOpBuilder &b, Value funcH, - const AbstractGemmLikeStrategy &strategy); - -Value buildBufferize(ImplicitLocOpBuilder &b, Value variantH); - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COMMON_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp deleted file mode 100644 index 0af1afac3110..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp +++ /dev/null @@ -1,359 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Transform/IR/TransformTypes.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") - -// TODO: significantly better namespacing. 
-using iree_compiler::buildPad; -using iree_compiler::buildSelectFirstNonEmpty; -using iree_compiler::buildTileFuseDistToForallWithNumThreads; -using iree_compiler::buildTileFuseDistToForallWithTileSizes; -using iree_compiler::TileToForallAndFuseAndDistributeResult; -using iree_compiler::gpu::buildBufferize; -using iree_compiler::gpu::buildConvertToAsyncCopies; -using iree_compiler::gpu::buildConvertToTensorCoreOp; -using iree_compiler::gpu::buildDistributeMatmulCopies; -using iree_compiler::gpu::buildHoistOutputPaddingOp; -using iree_compiler::gpu::buildMatmulVectorization; -using iree_compiler::gpu::buildMultiBuffering; -using iree_compiler::gpu::buildPipelineSharedMemoryCopies; -using iree_compiler::gpu::ImplicitGemmStrategy; -using iree_compiler::gpu::MappingInfo; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::IREE::transform_dialect::ApplyBubbleCollapsePatternsOp; -using iree_compiler::IREE::transform_dialect:: - ApplyFoldReshapeIntoTensorHalInterfacePatternsOp; -using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; -using iree_compiler::IREE::transform_dialect:: - PopulateWorkgroupCountRegionUsingNumThreadsSliceOp; -using transform::ConvertConv2DToImg2ColOp; -using transform::FuseIntoContainingOp; -using transform::MatchOp; -using transform::TileUsingForOp; -using transform_ext::RegisterMatchCallbacksOp; - -/// Options to set the default values of the matmul strategy. - -void ImplicitGemmStrategy::initDefaultValues(const GPUModel &gpuModel) { - assert(captures.convolutionDims.outputChannel.size() >= 1 && - "requires at least one output channel dimension"); - assert(captures.convolutionDims.inputChannel.size() >= 1 && - "requires at least one input channel dimension"); - assert(captures.convolutionDims.outputImage.size() >= 1 && - "requires at least one output image dimension"); - assert(captures.convolutionDims.filterLoop.size() >= 1 && - "requires at least one filter loop dimension"); - - // It is an NCHW conv if the output channel precedes the output image - // dimensions. - // TODO: This should be inferred directly from the shape of the input (i.e. - // input indexing map) rather than overall iterator classes. - filterLHS = captures.convolutionDims.outputChannel[0] < - captures.convolutionDims.outputImage[0]; - - int64_t channelSize = 1; - for (auto dim : captures.convolutionDims.outputChannel) - channelSize *= captures.convolutionOpSizes[dim]; - int64_t imageSize = 1; - for (auto dim : captures.convolutionDims.outputImage) - imageSize *= captures.convolutionOpSizes[dim]; - - derivedN = channelSize; - derivedM = imageSize; - if (filterLHS) - std::swap(derivedM, derivedN); - - derivedK = 1; - for (auto dim : captures.convolutionDims.filterLoop) - derivedK *= captures.convolutionOpSizes[dim]; - for (auto dim : captures.convolutionDims.inputChannel) - derivedK *= captures.convolutionOpSizes[dim]; - - // TODO: Capture input/output element types properly for configuring the - // padding values. - paddingValueTypes = {captures.inputElementType, captures.filterElementType, - captures.outputElementType}; - paddingDimensions = {0, 1, 2, 3}; - // TODO: Re-enable once padding works with the img2col op. - packingDimensions = - filterLHS ? SmallVector{1, 0, 1} : SmallVector{0, 1, 1}; - - // Pull in tile configs from flags. 
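The derivation of derivedM, derivedN and derivedK above reduces to a few products over the captured convolution dimensions. The following standalone sketch (hypothetical NHWC sizes, not taken from this diff) shows the mapping to implicit-GEMM dimensions, with the NCHW case swapping M and N exactly as the filterLHS logic does:

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

// Hypothetical example mirroring ImplicitGemmStrategy::initDefaultValues:
// output image dims {H=28, W=28}, output channels {F=64},
// filter loops {KH=3, KW=3}, input channels {C=32}.
int main() {
  std::vector<int64_t> outputImage = {28, 28};
  std::vector<int64_t> outputChannel = {64};
  std::vector<int64_t> filterLoop = {3, 3};
  std::vector<int64_t> inputChannel = {32};
  bool filterLHS = false; // NHWC: the image dims feed the LHS of the GEMM.

  auto product = [](const std::vector<int64_t> &v) {
    return std::accumulate(v.begin(), v.end(), int64_t{1},
                           std::multiplies<int64_t>());
  };
  int64_t m = product(outputImage);                         // 784
  int64_t n = product(outputChannel);                       // 64
  int64_t k = product(filterLoop) * product(inputChannel);  // 3*3*32 = 288
  if (filterLHS)
    std::swap(m, n); // NCHW convs put the filter on the LHS.
  std::cout << "implicit GEMM (M, N, K) = (" << m << ", " << n << ", " << k
            << ")\n";
}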
- AbstractGemmLikeStrategy::initDefaultValues(gpuModel); - - // TODO: Enable async-copies and pipelining - useAsyncCopies = false; - pipelineDepth = 0; -} - -LLVM_DUMP_METHOD void ImplicitGemmStrategy::dump() const { - print(llvm::errs()); -} - -void ImplicitGemmStrategy::print(llvm::raw_ostream &os) const { - os << "\n--- Implicit GEMM strategy ---\n"; - os << "- derived problem shape (MNK): " << m() << ", " << n() << ", " << k() - << '\n'; - os << "- convolution dim types: \n"; - llvm::interleaveComma(captures.convolutionDims.batch, os << "Batch: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.outputImage, - os << "OutputImage: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.outputChannel, - os << "OutputChannel: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.filterLoop, - os << "FilterLoop: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.inputChannel, - os << "InputChannel: "); - os << "\n"; - llvm::interleaveComma(captures.convolutionDims.depth, os << "Depth: "); - os << "\n"; - AbstractGemmLikeStrategy::print(os); -} - -LogicalResult ImplicitGemmStrategy::validate(const GPUModel &gpuModel) const { - // First validate the parent strategy. - if (failed(AbstractGemmLikeStrategy::validate(gpuModel))) - return failure(); - - if (batch() < blockTileBatch()) { - return emitError(UnknownLoc::get(ctx)) - << "batch( " << batch() << ") < blockTileBatch(" << blockTileBatch() - << ") this is at risk of not vectorizing and is NYI"; - } - - if (blockTileSizes.size() < 3) { - LDBG("--Not enough block tile sizes\n"); - return failure(); - } - - if (numWarps.size() < 3) { - LDBG("--Not enough num warps\n"); - return failure(); - } - - if (numThreads.size() < 3) { - LDBG("--Not enough num threads\n"); - return failure(); - } - - if (useFma) - return success(); - - // Currently unrolling is problematic without a unit batch. Fail for now. - if (blockTileBatch() != 1) { - LDBG("--Batch tile size must be 1 for tensor core strategies\n"); - return failure(); - } - - Type lhsElementType = captures.inputElementType; - Type rhsElementType = captures.filterElementType; - Type resElementType = captures.outputElementType; - if (!lhsElementType.isF32() || !rhsElementType.isF32() || - !resElementType.isF32()) { - LDBG("--Tensorcore implicit gemm strategy only supported for f32: " - << lhsElementType << ", " << rhsElementType << ", " << resElementType); - return failure(); - } - if (lhsElementType != rhsElementType) { - LDBG("--Tensorcore implicit gemm strategy mixed input types unsupported\n"); - return failure(); - } - - return success(); -} - -static std::tuple -buildConvolutionStrategyBlockDistribution( - ImplicitLocOpBuilder &b, Value variantH, - const ImplicitGemmStrategy &strategy) { - // Step 1. Call the matcher. Note that this is the same matcher as used to - // trigger this compilation path, so it must always apply. - b.create(); - auto [fillH, convolutionH, maybeTrailingH] = unpackRegisteredMatchCallback<3>( - b, "convolution", transform::FailurePropagationMode::Propagate, variantH); - - // Step 2. Do Img2Col on the convolution to get the GEMM + img2col op. - Type convType = convolutionH.getType(); - auto conv2DToImg2Col = b.create( - TypeRange{convType, convType}, convolutionH); - Value img2colH = conv2DToImg2Col.getImg2colTensor(); - Value transformedH = conv2DToImg2Col.getTransformed(); - - // The matmul is the producer of the transformed handle (expand back to - // convolution shape). 
- Value matmulH = b.create( - transformedH.getType(), transformedH, 0); - - // Bubble the expand_shape from img2col through the trailing elementwise - Value funcH = b.create(variantH, func::FuncOp::getOperationName()); - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - }); - - // Step 3. Create the block/mapping tiling level and fuse. - auto [fusionTargetH, fusionGroupH] = - buildSelectFirstNonEmpty(b, maybeTrailingH, matmulH); - MappingInfo blockMapping = strategy.getBlockMapping(); - TileToForallAndFuseAndDistributeResult tileResult = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*isolatedParentOpH=*/variantH, - /*rootH=*/fusionTargetH, - /*opsToFuseH=*/fusionGroupH, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)), - /*threadDimMapping=*/ - b.getArrayAttr(blockMapping.threadMapping)); - - // Handle the workgroup count region. - b.create( - tileResult.forallH); - - // Rematch the fill because earlier handle is invalidated. - Value newFillH = - b.create(variantH, linalg::FillOp::getOperationName()); - fillH = - b.create(newFillH, tileResult.forallH).getResult(0); - - Value tiledImg2colH = - b.create(img2colH, tileResult.forallH).getResult(0); - - auto [blockMatmulH, maybeBlockTrailingH] = buildSelectFirstNonEmpty( - b, tileResult.resultingFusedOpsHandles.front(), tileResult.tiledOpH); - - // TODO: handle trailing op. - return std::make_tuple(fillH, tiledImg2colH, blockMatmulH, - maybeBlockTrailingH, tileResult.forallH); -} - -// TODO: Merge with buildTileFuseToScfFor. -static mlir::iree_compiler::TileToScfForAndFuseResult -buildTileFuseToSingleScfFor(ImplicitLocOpBuilder &b, Value isolatedParentOpH, - Value rootH, Value opHToFuse, - ArrayRef tileSizes) { - iree_compiler::TileToScfForAndFuseResult result; - Type rootType = rootH.getType(); - auto tiletoScfForOp = b.create(rootType, rootH, tileSizes); - result.forLoops = tiletoScfForOp.getLoops(); - result.tiledOpH = tiletoScfForOp.getTiledLinalgOp(); - - assert(result.forLoops.size() == 1 && "More than one loop"); - - // TODO: Allow fusing more than one op. - b.create(opHToFuse, result.forLoops[0]); - // Avoid canonicalization for now to avoid prematurely folding away the pad - // ops. - return result; -} - -void iree_compiler::gpu::buildConvolutionImplicitGemmStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const ImplicitGemmStrategy &strategy) { - LLVM_DEBUG(strategy.print(DBGS())); - - // Step 1. Apply block-level part of the strategy, keeps everything fused. - auto [fillH, img2colH, matmulH, maybeTiledTrailingHBlock, forall] = - buildConvolutionStrategyBlockDistribution(b, variantH, strategy); - // Tile reduction loop. - SmallVector tileSizes{0, 0, 0, strategy.reductionTileSize}; - auto tileReductionResult = - buildTileFuseToSingleScfFor(b, variantH, matmulH, img2colH, tileSizes); - - // Step 2. Pad the matmul op. - auto paddedMatmulOpH = - buildPad(b, tileReductionResult.tiledOpH, - strategy.getZeroPadAttrFromElementalTypes(b).getValue(), - strategy.paddingDimensions, strategy.packingDimensions); - - // Step 3. Hoist the padding of the output operand above the reduction loop. - // The resulting fillOp will be mapped with the contraction using an SIMD - // programming model. 
- Value fillOpH; - if (!strategy.alignedRes()) { - fillOpH = buildHoistOutputPaddingOp(b, variantH, paddedMatmulOpH); - } else { - fillOpH = b.create(variantH, - linalg::FillOp::getOperationName()); - } - - Value funcH = b.create(variantH, func::FuncOp::getOperationName()); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Step 4. Distribute pad and copies: SIMT programming model. - auto [lhsCopyOpH, rhsCopyOpH, copyBackOpH] = - buildDistributeMatmulCopies(b, variantH, paddedMatmulOpH, strategy); - - // Step 5. Distribute to warps: SIMD programming model. - // TODO: get the number of warps from strategy. - MappingInfo computeMapping = strategy.computeMapping(); - buildTileFuseDistToForallWithNumThreads( - b, variantH, paddedMatmulOpH, ValueRange(), - getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), - b.getArrayAttr(computeMapping.threadMapping)); - buildTileFuseDistToForallWithNumThreads( - b, variantH, fillOpH, ValueRange(), - getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), - b.getArrayAttr(computeMapping.threadMapping)); - - // Step 6. Rank-reduce and vectorize. - b.create(funcH, [](OpBuilder &b, Location loc) { - b.create(loc); - b.create(loc); - b.create(loc); - }); - buildMatmulVectorization(b, variantH, lhsCopyOpH, rhsCopyOpH, copyBackOpH, - strategy, /*vectorizePadding=*/false, - /*vectorizeNdExtract=*/true); - - // Step 7. Bufferize and drop HAL descriptor from memref ops. - variantH = buildBufferize(b, variantH); - - // Step 8. Post-bufferization mapping to blocks and threads. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single func::FuncOp to transform, needs hardening. - // TODO: extract info from strategy. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - funcH = buildMapToBlockAndThreads(b, funcH, strategy.numThreads); - funcH = b.create(funcH); - - // Step 9. Convert to tensor core ops. - // TODO: avoid consuming handles and returning here. - funcH = buildConvertToTensorCoreOp(b, funcH, strategy); - - // TODO: Enable async copies/multibuffering/pipelining. - - // Step 10. Late lowerings and cleanups. - buildLowerVectorMasksAndCleanup(b, funcH); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h deleted file mode 100644 index e17e115abd2f..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_CONVOLUTION_IMPLICIT_GEMM_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_CONVOLUTION_IMPLICIT_GEMM_STRATEGY_H_ - -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Support/LogicalResult.h" - -namespace mlir::iree_compiler::gpu { - -struct GPUModel; - -class ImplicitGemmStrategy : public AbstractGemmLikeStrategy { -public: - ImplicitGemmStrategy( - MLIRContext *context, - const transform_ext::MatchedConvolutionCaptures &captures, - const GPUModel &gpuModel) - : AbstractGemmLikeStrategy(gpuModel), ctx(context), captures(captures) { - initDefaultValues(gpuModel); - } - - ImplicitGemmStrategy(const ImplicitGemmStrategy &) = default; - ImplicitGemmStrategy &operator=(const ImplicitGemmStrategy &) = default; - - /// Constructor quantities. - MLIRContext *ctx; - transform_ext::MatchedConvolutionCaptures captures; - - /// Initialize values from the CLI. Set cliOptionsSpecified to true if the - /// default CLI values have been overriden. - void initDefaultValues(const GPUModel &gpuModel) override; - - LogicalResult validate(const GPUModel &gpuModel) const override; - - int64_t batch() const { return captures.convolutionOpSizes[0]; } - int64_t m() const override { return derivedM; } - int64_t n() const override { return derivedN; } - int64_t k() const override { return derivedK; } - - /// Named accessors to block tile sizes associated with shapes. - int64_t blockTileBatch() const { return blockTileSizes[0]; } - int64_t blockTileM() const override { return blockTileSizes[1]; } - int64_t blockTileN() const override { return blockTileSizes[2]; } - - /// Number of threads to use. - int64_t numThreadsX() const { return numThreads[0]; } - int64_t numThreadsY() const { return numThreads[1]; } - int64_t numThreadsZ() const { return numThreads[2]; } - - /// Number of warps to use. - int64_t numWarpsX() const override { return numWarps[0]; } - int64_t numWarpsY() const override { return numWarps[1]; } - int64_t numWarpsZ() const { return numWarps[2]; } - - Type getLhsElementalType() const override { - return filterLHS ? captures.filterElementType : captures.inputElementType; - } - Type getRhsElementalType() const override { - return filterLHS ? captures.inputElementType : captures.filterElementType; - } - Type getResElementalType() const override { - return captures.outputElementType; - } - - MappingInfo getBlockMapping() const override { - // 2D named convolutions are always batched. - return MappingInfo{ - /*numThreads=*/{}, - /*tileSizes=*/{blockTileBatch(), blockTileM(), blockTileN()}, - /*threadMapping=*/{blockZ(ctx), blockY(ctx), blockX(ctx)}}; - } - - // LHS copy is of size (batch) x M x K. - MappingInfo lhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/k(), - /*copySizes=*/ - filterLHS ? 
ArrayRef{blockTileM(), reductionTileSize} - : ArrayRef{blockTileBatch(), blockTileM(), - reductionTileSize}, - /*favorPredication=*/false, - /*elementalBitWidth=*/lhsElementalBitWidth()); - } - - // RHS copy is of size (batch) x K x N. - MappingInfo rhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/n(), - /*copySizes=*/ - filterLHS ? ArrayRef{blockTileBatch(), reductionTileSize, - blockTileN()} - : ArrayRef{reductionTileSize, blockTileN()}, - /*favorPredication=*/false, - /*elementalBitWidth=*/rhsElementalBitWidth()); - } - - // RES copy is of size batch x M x N. - MappingInfo resCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/n(), - /*copySizes=*/{blockTileBatch(), blockTileM(), blockTileN()}, - /*favorPredication=*/false, - /*elementalBitWidth=*/resElementalBitWidth()); - } - - /// Check that the mapping computed for a copy is valid. - LogicalResult validateLhsCopyMapping() const override { - return validateCopyMapping(ctx, lhsCopyMapping(), "lhs"); - } - LogicalResult validateRhsCopyMapping() const override { - return validateCopyMapping(ctx, rhsCopyMapping(), "rhs"); - } - LogicalResult validateResCopyMapping() const override { - return validateCopyMapping(ctx, resCopyMapping(), "result"); - } - - // COMPUTE is of size batch x M x N. - MappingInfo computeMapping() const override { - if (useFma) { - return MappingInfo{ - /*numThreads=*/{numThreadsZ(), numThreadsY(), numThreadsX()}, - /*tileSizes=*/{}, - /*threadMapping=*/{threadZ(ctx), threadY(ctx), threadX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - return MappingInfo{/*numThreads=*/{numWarpsZ(), numWarpsY(), numWarpsX()}, - /*tileSizes=*/{}, - /*threadMapping=*/{warpZ(ctx), warpY(ctx), warpX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - - void print(llvm::raw_ostream &os) const override; - LLVM_DUMP_METHOD void dump() const override; - -private: - // For NCHW convolutions, the filter will be the LHS of the GEMM. - bool filterLHS = false; - - int64_t derivedM = 0; - int64_t derivedN = 0; - int64_t derivedK = 0; -}; - -void buildConvolutionImplicitGemmStrategy(ImplicitLocOpBuilder &b, - Value variantH, - const ImplicitGemmStrategy &strategy); - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_CONVOLUTION_IMPLICIT_GEMM_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp deleted file mode 100644 index 7d646303bec6..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h" - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-gpu-copy-mapping" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") - -int64_t iree_compiler::gpu::CopyMapping::maxContiguousElementsToTransfer( - int64_t alignment, int64_t numContiguousElements, - int64_t elementalBitWidth) { - assert(kCudaMaxVectorLoadBitWidth % elementalBitWidth == 0 && - "elemental bitwidth does not divide kCudaMaxVectorLoadBitWidth"); - return std::gcd(std::gcd(alignment, numContiguousElements), - kCudaMaxVectorLoadBitWidth / elementalBitWidth); -} - -FailureOr -iree_compiler::gpu::CopyMapping::numThreadsForCopy(int totalNumThreads, - int64_t alignment, - ArrayRef sizes, - bool favorPredication, - int64_t elementalBitWidth) { - LDBG("\nSTART numThreadsForCopy, favorPredication: " << favorPredication); - LLVM_DEBUG(llvm::interleaveComma(sizes, DBGS() << "--sizes: "); - llvm::dbgs() << "\n";); - - // Greedily find the largest vector size that can be used to copy the most - // minor dimension: we are in the business of filling 128B contiguous memory - // transactions with as few threads as possible. - int64_t maxVectorSize = CopyMapping::maxContiguousElementsToTransfer( - alignment, sizes.back(), elementalBitWidth); - LDBG("--maxVectorSize: " << maxVectorSize); - int64_t numElements = 1; - for (auto s : sizes) - numElements *= s; - LDBG("--numElements: " << numElements); - - int64_t actualVectorSize = maxVectorSize; - if (!favorPredication) { - // Bias towards reducing the vector size to avoid predication. - // Predication occurs if we end up using fewer than totalNumThreads for a - // particular copy. - // Predication chokes the current implementation of shared memory - // pipelining. - // TODO: Reevaluate this heuristic when we have a more robust pipelining - // implementation. - for (; actualVectorSize >= 1; actualVectorSize /= 2) { - LDBG("--step totalNumThreads * actualVectorSize: " - << totalNumThreads * actualVectorSize); - if (numElements % (totalNumThreads * actualVectorSize) != 0) - continue; - break; - } - LDBG("--numElements: " << numElements); - LDBG("--totalNumThreads: " << totalNumThreads); - LDBG("--actualVectorSize: " << actualVectorSize); - if (actualVectorSize == 0) { - LDBG("--Could not map copy without predication -> FAIL"); - return failure(); - } - } - - // Scale back the last size by actualVectorSize to account for the fact - // that we perform vector transfers. 
- assert(sizes.back() % actualVectorSize == 0 && - "most-minor size not divisible by actualVectorSize"); - SmallVector scaledSizes{sizes.begin(), sizes.end()}; - scaledSizes.back() /= actualVectorSize; - - int64_t numThreadsRemaining = totalNumThreads; - LDBG("--numThreadsRemaining: " << numThreadsRemaining); - SmallVector factors; - for (auto s : llvm::reverse(scaledSizes)) { - int64_t gcd = std::gcd(numThreadsRemaining, s); - factors.push_back(gcd); - numThreadsRemaining /= gcd; - LDBG("--new factors: " << gcd); - LDBG("--numThreadsRemaining: " << numThreadsRemaining); - } - - std::reverse(factors.begin(), factors.end()); - - LLVM_DEBUG(llvm::interleaveComma(factors, DBGS() << "numThreads: "); - llvm::dbgs() << "\n"; - LDBG("actualVectorSize: " << actualVectorSize);); - - return CopyMapping{actualVectorSize, factors}; -} - -iree_compiler::gpu::MappingInfo iree_compiler::gpu::CopyMapping::getMappingInfo( - MLIRContext *ctx, int totalNumThreads, int64_t alignment, - ArrayRef copySizes, bool favorPredication, - int64_t elementalBitWidth) { - assert(!copySizes.empty() && copySizes.size() <= 3 && - "only 1,2,3-D copies are supported for now"); - FailureOr maybeCopyMapping = - CopyMapping::numThreadsForCopy(totalNumThreads, alignment, copySizes, - favorPredication, elementalBitWidth); - // If failed, try again with predication; this must succeed. - if (failed(maybeCopyMapping)) { - assert(!favorPredication && - "maybe copy mapping may not fail with predication"); - maybeCopyMapping = CopyMapping::numThreadsForCopy( - totalNumThreads, alignment, copySizes, /*favorPredication=*/true, - elementalBitWidth); - } - assert(succeeded(maybeCopyMapping) && "failed to compute copy mapping"); - assert(maybeCopyMapping->numThreads.size() == copySizes.size() && - "compute copy mapping expected same number of threads and copy sizes"); - - SmallVector tileSizes = llvm::to_vector(llvm::map_range( - llvm::zip(copySizes, maybeCopyMapping->numThreads), [](auto &&pair) { - int64_t size, numThreads; - std::tie(size, numThreads) = pair; - return llvm::divideCeilSigned(size, numThreads); - })); - SmallVector allThreadMappings{linearId2(ctx), linearId1(ctx), - linearId0(ctx)}; - auto threadMapping = - llvm::to_vector(ArrayRef(allThreadMappings).take_back(tileSizes.size())); - - MappingInfo res{/*numThreads=*/maybeCopyMapping->numThreads, - /*tilecopySizes=*/tileSizes, - /*threadMapping=*/threadMapping, - /*vectorSize=*/maybeCopyMapping->vectorSize}; - LLVM_DEBUG(res.print(DBGS()); llvm::dbgs() << "\n"); - return res; -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h deleted file mode 100644 index a72f4d456088..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h +++ /dev/null @@ -1,81 +0,0 @@ - -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COPY_MAPPING_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COPY_MAPPING_H_ - -#include - -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" - -namespace mlir::iree_compiler::gpu { - -struct CopyMapping { - /// Vector size to use for the copy. 
- int64_t vectorSize; - - /// numThreads to use for the copy mapping, from most major to most minor dims - /// (i.e. numThreads.back() should be mapped to contiguous threads for best - /// coalescing). - SmallVector<int64_t> numThreads; - - /// Determine the maximal vector size to use to copy a contiguous array of - /// `numContiguousElements`, each of bitwidth `elementalBitWidth`. - /// The `alignment` is the number of elements by which the most minor - /// dimension of the copy is aligned. This is an approximation of actual - /// memory alignment after bufferization, for each row of the copy. This is - /// used to restrict the size of the copied vector so that it is properly aligned - /// with the requirements of cp.async. If the copy alignment does not match - /// the required alignment for a cp.async, the conversion to cp.async will be - /// skipped. - /// Asserts that `elementalBitWidth` divides `numContiguousElements`. - static int64_t - maxContiguousElementsToTransfer(int64_t alignment, - int64_t numContiguousElements, - int64_t elementalBitWidth = 32); - - /// Compute the number of threads to use to perform a copy of `sizes` - /// elements of `elementalBitWidth`. - /// The `alignment` is the number of elements by which the most minor - /// dimension of the copy is aligned. This is an approximation of actual - /// memory alignment after bufferization, for each row of the copy. This is - /// used to restrict the size of the copied vector so that it is properly aligned - /// with the requirements of cp.async. If the copy alignment does not match - /// the required alignment for a cp.async, the conversion to cp.async will be - /// skipped. - /// When `favorPredication` is false, the implementation avoids predication in - /// the copy, even if it means reducing the granularity of the transfer. - /// Otherwise, the implementation will come up with a best-effort predicated - /// mapping that respects the maximal vector transfer size. - static FailureOr<CopyMapping> - numThreadsForCopy(int totalNumThreads, int64_t alignment, - ArrayRef<int64_t> sizes, bool favorPredication, - int64_t elementalBitWidth = 32); - - /// Greedily compute the MappingInfo to use to perform a copy of `sizes` - /// elements of bitwidth `elementalBitWidth`. - /// The `alignment` is the number of elements by which the most minor - /// dimension of the copy is aligned. This is an approximation of actual - /// memory alignment after bufferization, for each row of the copy. This is - /// used to restrict the size of the copied vector so that it is properly aligned - /// with the requirements of cp.async. If the copy alignment does not match - /// the required alignment for a cp.async, the conversion to cp.async will be - /// skipped. When `favorPredication` is false, the mapping is computed to fill - /// all threads with an equal amount of data to copy, so as to avoid - /// predication. Predication often ends up breaking current pipelining - /// implementations down the line and is generally discouraged. At the moment, - /// asserts that sizes has exactly 2 entries.
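Taken together, the heuristics documented above (and implemented in CopyMapping.cpp earlier in this diff) do three things: pick the widest vector that divides the alignment, the row length and the 128-bit load limit, optionally shrink it so every thread gets an equal share of work (no predication), then greedily factor the available threads over the copy dimensions from most minor to most major and derive per-thread tile sizes. A standalone sketch with made-up numbers (256 threads copying a 128x64 tile of 32-bit elements aligned to 64 elements):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Standalone illustration of the CopyMapping heuristic; the driver values are
// hypothetical and the predication fallback of the real code is omitted.
constexpr int64_t kMaxVectorLoadBitWidth = 128;

static int64_t maxContiguousElements(int64_t alignment, int64_t rowElements,
                                     int64_t elementalBitWidth) {
  return std::gcd(std::gcd(alignment, rowElements),
                  kMaxVectorLoadBitWidth / elementalBitWidth);
}

int main() {
  int64_t totalNumThreads = 256, alignment = 64, bitWidth = 32;
  std::vector<int64_t> sizes = {128, 64}; // copy a 128x64 tile

  // Step 1: widest vector compatible with alignment and row length -> 4.
  int64_t vectorSize = maxContiguousElements(alignment, sizes.back(), bitWidth);

  // Step 2: shrink the vector until the copy divides evenly over all threads
  // (the real code may instead fail and retry with predication).
  int64_t numElements = sizes[0] * sizes[1]; // 8192
  while (vectorSize > 1 && numElements % (totalNumThreads * vectorSize) != 0)
    vectorSize /= 2; // stays at 4 here: 8192 % 1024 == 0

  // Step 3: greedily factor the threads over the vector-scaled sizes,
  // starting from the most minor dimension.
  std::vector<int64_t> scaled = {sizes[0], sizes[1] / vectorSize}; // {128, 16}
  std::vector<int64_t> numThreads(scaled.size());
  int64_t remaining = totalNumThreads;
  for (int64_t i = static_cast<int64_t>(scaled.size()) - 1; i >= 0; --i) {
    numThreads[i] = std::gcd(remaining, scaled[i]);
    remaining /= numThreads[i];
  } // numThreads = {16, 16}

  // Step 4: per-thread tile sizes are ceilDiv(copySize, numThreads).
  for (size_t i = 0; i < sizes.size(); ++i)
    std::cout << "dim " << i << ": numThreads=" << numThreads[i]
              << " tileSize=" << (sizes[i] + numThreads[i] - 1) / numThreads[i]
              << "\n"; // dim 0: 16 threads x tile 8, dim 1: 16 threads x tile 4
  std::cout << "vectorSize=" << vectorSize << "\n";
}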
- static MappingInfo getMappingInfo(MLIRContext *ctx, int totalNumThreads, - int64_t alignment, ArrayRef sizes, - bool favorPredication = false, - int64_t elementalBitWidth = 32); -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_COPY_MAPPING_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.cpp deleted file mode 100644 index 8d1c2c3eb108..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace mlir; - -void mlir::iree_compiler::gpu::MappingInfo::print(llvm::raw_ostream &os) const { - os << "MappingInfo{"; - os << "vectorSize: " << ((vectorSize.has_value()) ? vectorSize.value() : 0); - llvm::interleaveComma(numThreads, os << ", numThreads: {"); - llvm::interleaveComma(tileSizes, os << "}, tileSizes: {"); - llvm::interleaveComma(threadMapping, os << "}, threadMapping: {"); - os << "}}"; -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h deleted file mode 100644 index 806da4234a27..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h +++ /dev/null @@ -1,29 +0,0 @@ - -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_MAPPING_INFO_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_MAPPING_INFO_H_ - -#include "mlir/IR/Attributes.h" - -namespace mlir::iree_compiler::gpu { - -/// Helper struct to hold the mapping information for a given operation. -struct MappingInfo { - SmallVector numThreads; - // Note: explicitly computing the tileSizes is only needed until masked - // vectorization properly computes the bounds automatically. - SmallVector tileSizes; - SmallVector threadMapping; - std::optional vectorSize; - void print(llvm::raw_ostream &os) const; - LLVM_DUMP_METHOD void dump() const; -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_MAPPING_INFO_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp deleted file mode 100644 index 4bb56107fd41..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp +++ /dev/null @@ -1,347 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Transform/IR/TransformAttrs.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Transform/IR/TransformTypes.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") - -// TODO: significantly better namespacing. -using iree_compiler::buildPad; -using iree_compiler::buildTileFuseDistToForallWithNumThreads; -using iree_compiler::buildTileFuseDistToForallWithTileSizes; -using iree_compiler::TileToForallAndFuseAndDistributeResult; -using iree_compiler::gpu::BatchMatmulStrategy; -using iree_compiler::gpu::buildBufferize; -using iree_compiler::gpu::buildConvertToAsyncCopies; -using iree_compiler::gpu::buildConvertToTensorCoreOp; -using iree_compiler::gpu::buildDistributeMatmulCopies; -using iree_compiler::gpu::buildHoistOutputPaddingOp; -using iree_compiler::gpu::buildMatmulVectorization; -using iree_compiler::gpu::buildMultiBuffering; -using iree_compiler::gpu::buildPipelineSharedMemoryCopies; -using iree_compiler::gpu::MappingInfo; -using iree_compiler::gpu::MatmulStrategy; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; -using iree_compiler::IREE::transform_dialect:: - PopulateWorkgroupCountRegionUsingNumThreadsSliceOp; -using transform::MatchOp; -using transform_ext::RegisterMatchCallbacksOp; - -void MatmulStrategy::initDefaultValues(const GPUModel &gpuModel) { - // Set the configuration for padding the matmul. - paddingValueTypes = {captures.lhsElementType, captures.rhsElementType, - captures.outputElementType}; - paddingDimensions = {0, 1, 2}; - packingDimensions = {1, 1, 1}; - - // Pull in tile configs from flags. - AbstractGemmLikeStrategy::initDefaultValues(gpuModel); -} - -LLVM_DUMP_METHOD void MatmulStrategy::dump() const { print(llvm::errs()); } - -void MatmulStrategy::print(llvm::raw_ostream &os) const { - os << "\n--- Matmul strategy ---\n"; - AbstractGemmLikeStrategy::print(os); -} - -LogicalResult MatmulStrategy::validate(const GPUModel &gpuModel) const { - // First validate the parent strategy. - if (failed(AbstractGemmLikeStrategy::validate(gpuModel))) - return failure(); - - // Unlike for wmma/mma, we have no special type requirements for fma. 
- if (useFma) - return success(); - - Type lhsElementType = captures.lhsElementType; - Type rhsElementType = captures.rhsElementType; - Type resElementType = captures.outputElementType; - if (!lhsElementType.isF32() || !rhsElementType.isF32() || - !resElementType.isF32()) { - LDBG("--Tensorcore matmul strategy only supported for f32: " - << lhsElementType << ", " << rhsElementType << ", " << resElementType); - return failure(); - } - if (lhsElementType != rhsElementType) { - LDBG("--Tensorcore matmul strategy mixed input types unsupported\n"); - return failure(); - } - - if (useMmaSync) { - if (!gpuModel.hasTF32TensorCore) { - LDBG("--Matmul strategy target has not TF32 tensor core\n"); - return failure(); - } - - if (!gpuModel.hasMmaSync) { - LDBG("--Matmul strategy target does not support MMA.SYNC operations\n"); - return failure(); - } - } else { - // Verify WMMA. - // Hard coded to reflect current WMMA unrolling support. - int reqM = 16; - int reqN = 16; - int reqK = lhsElementType.isF32() ? 8 : 16; - if (llvm::all_of(gpuModel.supportedWMMAConfigs, - [&](iree_compiler::gpu::MMAConfig config) { - return config.m != reqM || config.n != reqN || - config.k != reqK || - config.aType != lhsElementType || - config.bType != rhsElementType || - config.cType != resElementType; - })) { - LDBG("--Matmul strategy failed wmma type check\n"); - return failure(); - } - } - return success(); -} - -LogicalResult BatchMatmulStrategy::validate(const GPUModel &gpuModel) const { - if (failed(MatmulStrategy::validate(gpuModel))) { - return failure(); - } - - if (batch() < blockTileBatch()) { - return emitError(UnknownLoc::get(ctx)) - << "batch( " << batch() << ") < blockTileBatch(" << blockTileBatch() - << ") this is at risk of not vectorizing and is NYI"; - } - - // Only single outermost batch dimension is currently supported. - if (captures.batches().size() != 1 || captures.batches().back() != 0) { - LDBG("--Couldn't find single outermost batch dimension\n"); - return failure(); - } - - if (blockTileSizes.size() < 3) { - LDBG("--Not enough block tile sizes\n"); - return failure(); - } - - if (numWarps.size() < 3) { - LDBG("--Not enough num warps\n"); - return failure(); - } - - if (numThreads.size() < 3) { - LDBG("--Not enough num threads\n"); - return failure(); - } - - if (!useFma) { - LDBG("--Only FMA is supported for batch matmul atm\n"); - return failure(); - } - - return success(); -} - -static std::tuple -buildMatmulStrategyBlockDistribution(ImplicitLocOpBuilder &b, Value variantH, - const MatmulStrategy &strategy) { - // Step 1. Call the matcher. Note that this is the same matcher as used to - // trigger this compilation path, so it must always apply. - b.create(); - auto [fillH, matmulH, maybeTrailingH] = unpackRegisteredMatchCallback<3>( - b, "matmul", transform::FailurePropagationMode::Propagate, variantH); - - // Step 2. Create the block/mapping tiling level and fusee. - // auto [fusionTargetH, fusionGroupH] = - // buildSelectFirstNonEmpty(b, maybeTrailingH, matmulH); - MappingInfo blockMapping = strategy.getBlockMapping(); - TileToForallAndFuseAndDistributeResult tileResult = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/matmulH, - /*opsToFuseH=*/fillH, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)), - /*threadDimMapping=*/ - b.getArrayAttr(blockMapping.threadMapping)); - - // Handle the workgroup count region. - b.create( - tileResult.forallH); - - // TODO: handle trailing op. 
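// [Illustrative aside, not part of the deleted sources] A standalone
// restatement of the WMMA capability check in MatmulStrategy::validate above:
// the strategy is rejected when no supported config matches the required
// 16x16x8 (f32) or 16x16x16 shape with matching element types. Element types
// are reduced to an enum for brevity; this is a sketch, not the IREE MMAConfig.
#include <algorithm>
#include <vector>

enum class Elt { F16, F32 };

struct MMAConfigSketch {
  int m, n, k;
  Elt aType, bType, cType;
};

bool supportsRequiredWMMA(const std::vector<MMAConfigSketch> &supportedConfigs,
                          Elt lhs, Elt rhs, Elt res) {
  int reqM = 16, reqN = 16;
  int reqK = (lhs == Elt::F32) ? 8 : 16;  // mirrors the hard-coded WMMA unrolling support
  return std::any_of(supportedConfigs.begin(), supportedConfigs.end(),
                     [&](const MMAConfigSketch &c) {
                       return c.m == reqM && c.n == reqN && c.k == reqK &&
                              c.aType == lhs && c.bType == rhs && c.cType == res;
                     });
}
// e.g. supportsRequiredWMMA({{16, 16, 8, Elt::F32, Elt::F32, Elt::F32}},
//                           Elt::F32, Elt::F32, Elt::F32) == true.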
- return std::make_tuple(tileResult.resultingFusedOpsHandles.front(), - tileResult.tiledOpH, Value(), tileResult.forallH); -} - -/// Builds the common part of the schedule for matmuls and batched matmuls. -static void -buildCommonMatmulLikeThreadSchedule(ImplicitLocOpBuilder &b, Value variantH, - Value fillH, Value matmulH, - const MatmulStrategy &strategy) { - using mlir::iree_compiler::buildLowerVectorMasksAndCleanup; - using mlir::iree_compiler::buildTileFuseToScfFor; - using namespace mlir::iree_compiler::gpu; - - // Tile the reduction loop (last in the list). - SmallVector tileSizes(strategy.captures.matmulOpSizes.size() - 1, 0); - tileSizes.push_back(strategy.reductionTileSize); - - // Avoid canonicalizing before the pad to avoid folding away the extract_slice - // on the output needed to hoist the output pad. - auto tileReductionResult = buildTileFuseToScfFor( - b, variantH, matmulH, {}, getAsOpFoldResult(b.getI64ArrayAttr(tileSizes)), - /*canonicalize=*/false); - - // Step 2. Pad the (batch) matmul op. - auto paddedMatmulOpH = - buildPad(b, tileReductionResult.tiledOpH, - strategy.getZeroPadAttrFromElementalTypes(b).getValue(), - strategy.paddingDimensions, strategy.packingDimensions); - - // Step 3. Hoist the padding of the output operand above the reduction loop. - // The resulting fillOp will be mapped with the contraction using an SIMD - // programming model. - Value fillOpH = fillH; - if (!strategy.alignedRes()) { - fillOpH = buildHoistOutputPaddingOp(b, variantH, paddedMatmulOpH); - } - - // Running canonicalization is required here to enable aligned pads to become - // linalg.copy ops when rewriting in DPS. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); - - // Step 4. Distribute pad and copies: SIMT programming model. - auto [lhsCopyOpH, rhsCopyOpH, copyBackOpH] = - buildDistributeMatmulCopies(b, variantH, paddedMatmulOpH, strategy); - - // Step 5. Distribute to warps: SIMD programming model. - // TODO: get the number of warps from strategy. - MappingInfo computeMapping = strategy.computeMapping(); - buildTileFuseDistToForallWithNumThreads( - b, variantH, paddedMatmulOpH, ValueRange(), - getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), - b.getArrayAttr(computeMapping.threadMapping)); - buildTileFuseDistToForallWithNumThreads( - b, variantH, fillOpH, ValueRange(), - getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), - b.getArrayAttr(computeMapping.threadMapping)); - - // Step 6. Rank-reduce and vectorize. - buildMatmulVectorization(b, variantH, lhsCopyOpH, rhsCopyOpH, copyBackOpH, - strategy); - - // Step 7. Bufferize and drop HAL descriptor from memref ops. - variantH = buildBufferize(b, variantH); - - // Step 8. Post-bufferization mapping to blocks and threads. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single func::FuncOp to transform, needs hardening. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - funcH = - buildMapToBlockAndThreads(b, funcH, - /*blockSize=*/strategy.numThreads, - /*subgroupSize=*/strategy.targetSubgroupSize); - funcH = b.create(funcH); - - // Step 9. Convert to tensor core ops. - // TODO: avoid consuming handles and returning here. - funcH = buildConvertToTensorCoreOp(b, funcH, strategy); - - // TODO: Support pipelining strategies without async copy (e.g. store to - // shared memory in stage 0). - if (strategy.useAsyncCopies) { - // Step 10. Multi-buffering. 
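// [Illustrative aside, not part of the deleted sources] The reduction tiling at
// the top of buildCommonMatmulLikeThreadSchedule above tiles only the trailing
// reduction loop: every parallel dimension gets tile size 0 ("do not tile") and
// the last entry is the reduction tile size. A plain-vector sketch:
#include <cstdint>
#include <vector>

std::vector<int64_t> reductionOnlyTileSizes(size_t numLoops, int64_t reductionTileSize) {
  std::vector<int64_t> tileSizes(numLoops - 1, 0);  // leave parallel dims untiled
  tileSizes.push_back(reductionTileSize);           // tile the trailing reduction dim
  return tileSizes;
}
// e.g. for an (M, N, K) matmul with reductionTileSize = 16: {0, 0, 16}.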
- if (strategy.pipelineDepth > 1) - buildMultiBuffering(b, funcH, strategy); - - // Step 11. Convert to async copies. - // TODO: avoid consuming handles and returning here. - funcH = buildConvertToAsyncCopies(b, funcH, strategy); - - // Step 12. Pipeline shared memory copies. - if (strategy.pipelineDepth > 1) - buildPipelineSharedMemoryCopies(b, funcH, strategy); - } - - // Step 13. Late lowerings and cleanups. - buildLowerVectorMasksAndCleanup(b, funcH); -} - -void iree_compiler::gpu::buildMatmulTensorCoreStrategy( - ImplicitLocOpBuilder &b, Value variantH, const MatmulStrategy &strategy) { - LLVM_DEBUG(strategy.print(DBGS())); - - // Step 1. Apply block-level part of the strategy, keeps everything fused. - auto [fillH, matmulH, maybeTiledTrailingHBlock, forall] = - buildMatmulStrategyBlockDistribution(b, variantH, strategy); - buildCommonMatmulLikeThreadSchedule(b, variantH, fillH, matmulH, strategy); -} - -/// Builds the transform dialect operations distributing batch matmul across -/// blocks according to the given strategy. -static std::tuple -buildBatchMatmulStrategyBlockDistribution(ImplicitLocOpBuilder &b, - Value variantH, - const BatchMatmulStrategy &strategy) { - b.create(); - auto [fillH, bmmH] = unpackRegisteredMatchCallback<2>( - b, "batch_matmul", transform::FailurePropagationMode::Propagate, - variantH); - - MappingInfo blockMapping = strategy.getBlockMapping(); - TileToForallAndFuseAndDistributeResult tileResult = - buildTileFuseDistToForallWithTileSizes( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/bmmH, - /*opsToFuseH=*/fillH, - /*tileSizes=*/ - getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)), - /*threadDimMapping=*/ - b.getArrayAttr(blockMapping.threadMapping)); - - // Handle the workgroup count region. - b.create( - tileResult.forallH); - return std::make_tuple(tileResult.resultingFusedOpsHandles.front(), - tileResult.tiledOpH, tileResult.forallH); -} - -void iree_compiler::gpu::buildBatchMatmulStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const BatchMatmulStrategy &strategy) { - LLVM_DEBUG(strategy.print(DBGS())); - - auto [fillH, matmulH, forallH] = - buildBatchMatmulStrategyBlockDistribution(b, variantH, strategy); - buildCommonMatmulLikeThreadSchedule(b, variantH, fillH, matmulH, strategy); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h deleted file mode 100644 index 99c41d05b940..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h +++ /dev/null @@ -1,276 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_TENSOR_CORE_MATMUL_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_TENSOR_CORE_MATMUL_STRATEGY_H_ - -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Support/LogicalResult.h" - -namespace mlir::iree_compiler::gpu { - -struct GPUModel; - -class MatmulStrategy : public AbstractGemmLikeStrategy { -public: - MatmulStrategy(MLIRContext *context, - const transform_ext::MatchedMatmulCaptures &captures, - const GPUModel &gpuModel) - : AbstractGemmLikeStrategy(gpuModel), ctx(context), captures(captures) { - initDefaultValues(gpuModel); - } - - MatmulStrategy(const MatmulStrategy &) = default; - MatmulStrategy &operator=(const MatmulStrategy &) = default; - - /// Constructor quantities. - MLIRContext *ctx; - transform_ext::MatchedMatmulCaptures captures; - - /// Initialize values from the CLI. Set cliOptionsSpecified to true if the - /// default CLI values have been overriden. - void initDefaultValues(const GPUModel &gpuModel) override; - - LogicalResult validate(const GPUModel &gpuModel) const override; - - int64_t m() const override { - assert(captures.matmulOpSizes.size() == 3 && "need 3 sizes"); - return captures.matmulOpSizes[0]; - } - int64_t n() const override { - assert(captures.matmulOpSizes.size() == 3 && "need 3 sizes"); - return captures.matmulOpSizes[1]; - } - int64_t k() const override { - assert(captures.matmulOpSizes.size() == 3 && "need 3 sizes"); - return captures.matmulOpSizes[2]; - } - - int64_t blockTileM() const override { - assert(blockTileSizes.size() >= 2 && "need at least 2 tile sizes"); - return blockTileSizes[0]; - } - int64_t blockTileN() const override { - assert(blockTileSizes.size() >= 2 && "need at least 2 tile sizes"); - return blockTileSizes[1]; - } - - int64_t numWarpsX() const override { - assert(numWarps.size() >= 2 && "need at least 2 warp sizes"); - return numWarps[0]; - } - int64_t numWarpsY() const override { - assert(numWarps.size() >= 2 && "need at least 2 warp sizes"); - return numWarps[1]; - } - - Type getLhsElementalType() const override { return captures.lhsElementType; } - Type getRhsElementalType() const override { return captures.rhsElementType; } - Type getResElementalType() const override { - return captures.outputElementType; - } - - MappingInfo getBlockMapping() const override { - return MappingInfo{/*numThreads=*/{}, - /*tileSizes=*/{blockTileM(), blockTileN()}, - /*threadMapping=*/{blockY(ctx), blockX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - - // LHS copy is of size mxk. - MappingInfo lhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/k(), - /*copySizes=*/ArrayRef{blockTileM(), reductionTileSize}, - /*favorPredication=*/false, - /*elementalBitWidth=*/lhsElementalBitWidth()); - } - LogicalResult validateLhsCopyMapping() const override { - MappingInfo mapping = lhsCopyMapping(); - // It is fine to use fewer threads to copy the LHS. 
- if (totalNumThreads() < mapping.numThreads[0] * mapping.numThreads[1]) { - llvm::errs() << "too many threads used for transferring lhs: " - << mapping.numThreads[0] << " * " << mapping.numThreads[1] - << " > " << totalNumThreads() << "\n"; - return failure(); - } - return success(); - } - - // RHS copy is of size kxn. - MappingInfo rhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/n(), - /*copySizes=*/ArrayRef{reductionTileSize, blockTileN()}, - /*favorPredication=*/false, - /*elementalBitWidth=*/rhsElementalBitWidth()); - } - LogicalResult validateRhsCopyMapping() const override { - MappingInfo mapping = rhsCopyMapping(); - // It is fine to use fewer threads to copy the RHS. - if (totalNumThreads() < mapping.numThreads[0] * mapping.numThreads[1]) { - llvm::errs() << "too many threads used for transferring rhs: " - << mapping.numThreads[0] << " * " << mapping.numThreads[1] - << " > " << totalNumThreads() << "\n"; - return failure(); - } - return success(); - } - - // RES copy is of size mxn. - MappingInfo resCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), - /*alignment=*/n(), - /*copySizes=*/ArrayRef{blockTileM(), blockTileN()}, - /*favorPredication=*/false, - /*elementalBitWidth=*/resElementalBitWidth()); - } - - LogicalResult validateResCopyMapping() const override { - MappingInfo mapping = resCopyMapping(); - // It is fine to use fewer threads to copy the RES. - if (totalNumThreads() < mapping.numThreads[0] * mapping.numThreads[1]) { - llvm::errs() << "too many threads used for transferring res: " - << mapping.numThreads[0] << " * " << mapping.numThreads[1] - << " > " << totalNumThreads() << "\n"; - return failure(); - } - return success(); - } - - // COMPUTE is of size mxn. - MappingInfo computeMapping() const override { - if (useFma) { - // When using FMA we don't need to map to warps, instead just match what - // the copy does. - return CopyMapping::getMappingInfo(ctx, totalNumThreads(), - /*alignment=*/n(), - {blockTileM(), blockTileN()}); - } - return MappingInfo{/*numThreads=*/{numWarpsY(), numWarpsX()}, - /*tileSizes=*/{}, - /*threadMapping=*/{warpY(ctx), warpX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - - void print(llvm::raw_ostream &os) const override; - LLVM_DUMP_METHOD void dump() const override; -}; - -/// An extension of the matmul strategy to batched matrix multiplications. -class BatchMatmulStrategy : public MatmulStrategy { -public: - /// Construct the default strategy, pulling options from the command-line - /// arguments if provided and using the defaults otherwise. - BatchMatmulStrategy(MLIRContext *context, const GPUModel &gpuModel, - const transform_ext::MatchedMatmulCaptures &captures) - : MatmulStrategy(context, captures, gpuModel) { - initDefaultValues(gpuModel); - } - - /// Initialize the default values of the strategy. - void initDefaultValues(const GPUModel &gpuModel) override { - // First, initialize as if this was a simple matmul. - MatmulStrategy::initDefaultValues(gpuModel); - - // Make sure we pad along all dimensions. - paddingDimensions = {0, 1, 2, 3}; - packingDimensions = {1, 1, 1, 1}; - } - - /// Check that the strategy is valid for the captures and the model. - LogicalResult validate(const GPUModel &gpuModel) const override; - - /// Named accessors to shapes. 
- int64_t batch() const { return captures.matmulOpSizes[0]; } - int64_t m() const override { return captures.matmulOpSizes[1]; } - int64_t n() const override { return captures.matmulOpSizes[2]; } - int64_t k() const override { return captures.matmulOpSizes[3]; } - - /// Named accessors to block tile sizes associated with shapes. - int64_t blockTileBatch() const { return blockTileSizes[0]; } - int64_t blockTileM() const override { return blockTileSizes[1]; } - int64_t blockTileN() const override { return blockTileSizes[2]; } - - /// Number of threads to use. - int64_t numThreadsX() const { return numThreads[0]; } - int64_t numThreadsY() const { return numThreads[1]; } - int64_t numThreadsZ() const { return numThreads[2]; } - - /// Number of warps to use. - int64_t numWarpsX() const override { return numWarps[0]; } - int64_t numWarpsY() const override { return numWarps[1]; } - int64_t numWarpsZ() const { return numWarps[2]; } - - MappingInfo getBlockMapping() const override { - return MappingInfo{ - /*numThreads=*/ - {}, - /*tileSizes=*/{blockTileBatch(), blockTileM(), blockTileN()}, - /*threadMapping=*/{blockZ(ctx), blockY(ctx), blockX(ctx)}, - /*vectorSize=*/std::nullopt}; - } - - // LHS copy is batch x M x K. - MappingInfo lhsCopyMapping() const override { - // TODO: generalize to transpositions, here and below. - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), k(), - {blockTileBatch(), blockTileM(), reductionTileSize}, - /*favorPredication=*/false, - captures.lhsElementType.getIntOrFloatBitWidth()); - } - - // RHS copy is batch x K x N. - MappingInfo rhsCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), n(), - {blockTileBatch(), reductionTileSize, blockTileN()}, - /*favorPredication=*/false, - captures.rhsElementType.getIntOrFloatBitWidth()); - } - - // RES copy is batch x M x N. - MappingInfo resCopyMapping() const override { - return CopyMapping::getMappingInfo( - ctx, totalNumThreads(), n(), - {blockTileBatch(), blockTileM(), blockTileN()}, - /*favorPredication=*/false, - captures.outputElementType.getIntOrFloatBitWidth()); - } - - /// Check that the mapping computed for a copy is valid. - LogicalResult validateLhsCopyMapping() const override { - return validateCopyMapping(ctx, lhsCopyMapping(), "lhs"); - } - LogicalResult validateRhsCopyMapping() const override { - return validateCopyMapping(ctx, rhsCopyMapping(), "rhs"); - } - LogicalResult validateResCopyMapping() const override { - return validateCopyMapping(ctx, resCopyMapping(), "result"); - } - - // Compute is of the size batch x M x N. - MappingInfo computeMapping() const override { - assert(useFma && "only fma is currently supported"); - return MappingInfo{{numThreadsZ(), numThreadsY(), numThreadsX()}, - {}, - {threadZ(ctx), threadY(ctx), threadX(ctx)}, - std::nullopt}; - } -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_TENSOR_CORE_MATMUL_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.cpp deleted file mode 100644 index a9f6ed34d8a1..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
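// [Illustrative aside, not part of the deleted sources] Each of the
// validate*CopyMapping overloads above (and the shared validateCopyMapping used
// by the batch variant) boils down to one budget check: the product of the
// per-dimension thread counts chosen for a copy must not exceed the threads in
// the block, while using fewer is explicitly fine. A rank-agnostic sketch:
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

bool validateCopyThreadBudget(int64_t totalNumThreads,
                              const std::vector<int64_t> &copyNumThreads,
                              const char *operandName) {
  int64_t used = std::accumulate(copyNumThreads.begin(), copyNumThreads.end(),
                                 int64_t{1}, std::multiplies<int64_t>());
  if (used > totalNumThreads) {
    std::cerr << "too many threads used for transferring " << operandName
              << ": " << used << " > " << totalNumThreads << "\n";
    return false;
  }
  return true;  // using fewer threads than available is fine
}
// e.g. a 3-D batch-matmul copy mapped to {1, 8, 32} threads fits a 256-thread block.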
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Transform/IR/TransformTypes.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. -using iree_compiler::blockX; -using iree_compiler::blockY; -using iree_compiler::blockZ; -using iree_compiler::buildPad; -using iree_compiler::TileToForallAndFuseAndDistributeResult; -using iree_compiler::gpu::buildBufferize; -using iree_compiler::gpu::buildConvertToAsyncCopies; -using iree_compiler::gpu::buildDistributeOnePadOrCopyWithNumThreads; -using iree_compiler::gpu::buildDistributeOnePadOrCopyWithTileSizes; -using iree_compiler::gpu::PadStrategy; -using iree_compiler::IREE::transform_dialect:: - PopulateWorkgroupCountRegionUsingNumThreadsSliceOp; -using transform::MatchOp; -using transform_ext::RegisterMatchCallbacksOp; - -static llvm::cl::list clBlockTileSizes( - "td-pad-strategy-blk-sizes", - llvm::cl::desc("block tile sizes for dims (x,y,z) for the transform " - "dialect pad strategy"), - llvm::cl::list_init(ArrayRef{64, 64, 1}), - llvm::cl::CommaSeparated); -static llvm::cl::list clNumThreads( - "td-pad-strategy-num-threads", - llvm::cl::desc("number of threads for dims (x,y,z) for the transform " - "dialect pad strategy"), - llvm::cl::list_init(ArrayRef{16, 16, 1}), - llvm::cl::CommaSeparated); -static llvm::cl::list clVectorSize( - "td-pad-strategy-vector-size", - llvm::cl::desc("vector size for the transform dialect pad strategy"), - llvm::cl::list_init(ArrayRef{4, 4}), llvm::cl::CommaSeparated); -static llvm::cl::opt clUseAsyncCopies( - "td-pad-strategy-use-async-copies", - llvm::cl::desc( - "use async copies through shared memory for the pad strategy"), - llvm::cl::init(false)); - -void iree_compiler::gpu::PadStrategy::initDefaultValues() { - blockTileSizes = - SmallVector{clBlockTileSizes.begin(), clBlockTileSizes.end()}; - numThreads = SmallVector{clNumThreads.begin(), clNumThreads.end()}; - vectorSize = SmallVector{clVectorSize.begin(), clVectorSize.end()}; - useAsyncCopies = clUseAsyncCopies; -} - -void iree_compiler::gpu::PadStrategy::configure(GPUModel gpuModel) {} - -static std::tuple -buildPadStrategyBlockDistribution(ImplicitLocOpBuilder &b, Value variantH, - const PadStrategy &strategy) { - // Step 1. Call the matcher. Note that this is the same matcher as used to - // trigger this compilation path, so it must always apply. 
- b.create(); - auto [padH] = unpackRegisteredMatchCallback<1>( - b, "pad", transform::FailurePropagationMode::Propagate, variantH); - - // Step 2. Create the block/mapping tiling level. - MLIRContext *ctx = b.getContext(); - auto [tiledPadH, forallH] = buildDistributeOnePadOrCopyWithTileSizes( - b, variantH, padH, - /*tileSizes=*/{strategy.blockTileSizeY(), strategy.blockTileSizeX()}, - /*threadDimMapping=*/{blockY(ctx), blockX(ctx)}, /*foldIfBranch=*/true); - - // Step 3.Handle the workgroup count region. - b.create(forallH); - return std::make_tuple(tiledPadH, forallH); -} - -void iree_compiler::gpu::buildPadStrategy(ImplicitLocOpBuilder &b, - Value variantH, - const PadStrategy &strategy) { - MLIRContext *ctx = b.getContext(); - // Step 1. Apply block-level part of the strategy. - auto [padBlockH, forallBlockH] = - buildPadStrategyBlockDistribution(b, variantH, strategy); - - // Step 2. Apply thread-level part of the strategy. - auto padThreadH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, padBlockH, - /*numThreads=*/{strategy.numThreadsY(), strategy.numThreadsX()}, - /*threadDimMapping=*/{threadY(ctx), threadX(ctx)}, /*foldIfBranch=*/true); - - // Step 3. Masked vectorization. - SmallVector scalableSizes(strategy.vectorSize.size(), false); - b.create(padThreadH, ValueRange(), - strategy.vectorSize, nullptr, scalableSizes); - - // Step 4. Lower all masked vector transfers at this point, as they make - // canonicalization generate incorrect IR. - // TODO: don't rematch, apply on the variant op directly. - Value funcH = - b.create(variantH, func::FuncOp::getOperationName()); - buildLowerMaskedTransfersAndCleanup(b, funcH); - - // Step 5. Vectorize the rest of func normally. - funcH = buildVectorize(b, funcH, /*applyCleanups=*/true); - - // Step 6. Bufferize and drop HAL descriptor from memref ops. - variantH = buildBufferize(b, variantH); - - // Step 7. Post-bufferization mapping to blocks and threads. - // Need to match again since bufferize invalidated all handles. - // TODO: assumes a single func::FuncOp to transform, needs hardening. - funcH = b.create(variantH, func::FuncOp::getOperationName()); - funcH = buildMapToBlockAndThreads( - b, funcH, - /*blockSize=*/ - {strategy.numThreadsX(), strategy.numThreadsY(), strategy.numThreadsZ()}); - - // TODO: Multi-buffering and async copies in cases where HW supports it. - assert(!strategy.useAsyncCopies && "not implemented yet"); - - // Step 8. Lower masks before returning to the default lowering pipeline. - buildLowerVectorMasksAndCleanup(b, funcH); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h deleted file mode 100644 index 45aa80837676..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
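// [Illustrative aside, not part of the deleted sources] With the default flags
// above (64x64 block tile, 16x16 threads, 4x4 vector size), the thread-level
// distribution in buildPadStrategy gives each thread exactly one 4x4 vector
// chunk of the padded tile. A small standalone check of that relationship;
// perThreadTile is an illustrative helper, not a deleted IREE function:
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> perThreadTile(const std::vector<int64_t> &blockTile,
                                   const std::vector<int64_t> &numThreads) {
  std::vector<int64_t> result;
  for (size_t i = 0; i < numThreads.size(); ++i)
    result.push_back(blockTile[i] / numThreads[i]);  // assumes even divisibility
  return result;
}

int main() {
  std::vector<int64_t> tile = perThreadTile({64, 64}, {16, 16});
  assert(tile == std::vector<int64_t>({4, 4}));  // matches td-pad-strategy-vector-size
}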
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_PAD_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_PAD_STRATEGY_H_ - -#include - -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" - -namespace mlir::iree_compiler::gpu { - -struct PadConfig {}; - -/// Simple padding strategy. -class PadStrategy : public GPUStrategy { -public: - PadStrategy(MLIRContext *context, - const transform_ext::MatchedPadCaptures &captures, - const PadConfig &config, const GPUModel &gpuModel) - : GPUStrategy(gpuModel), ctx(context), captures(captures) { - initDefaultValues(); - (void)config; - } - - PadStrategy(const PadStrategy &) = default; - PadStrategy &operator=(const PadStrategy &) = default; - - void initDefaultValues(); - void configure(GPUModel gpuModel); - - int64_t blockTileSizeX() const { return blockTileSizes[0]; } - int64_t blockTileSizeY() const { return blockTileSizes[1]; } - int64_t blockTileSizeZ() const { return blockTileSizes[2]; } - int64_t numThreadsX() const { return numThreads[0]; } - int64_t numThreadsY() const { return numThreads[1]; } - int64_t numThreadsZ() const { return numThreads[2]; } - - /// Constructor quantities. - MLIRContext *ctx; - transform_ext::MatchedPadCaptures captures; - - /// Tile sizes for the workgroup / determines grid size for all known - /// reduction strategies. - SmallVector blockTileSizes; - SmallVector numThreads; - SmallVector vectorSize; - // TODO: implement this case. - bool useAsyncCopies = false; -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_PAD_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.cpp deleted file mode 100644 index 071ab9715eef..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.cpp +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. 
-using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using iree_compiler::IREE::transform_dialect::VectorToWarpExecuteOnLane0Op; -using iree_compiler::IREE::transform_dialect::VectorWarpDistributionOp; -using transform::FuseIntoContainingOp; -using transform::MatchOp; -using transform::ScalarizeOp; -using transform::SequenceOp; -using transform_ext::MatchCallbackOp; -using transform_ext::RegisterMatchCallbacksOp; -using transform_ext::StructuredOpMatcher; - -using iree_compiler::AbstractReductionStrategy; -using iree_compiler::gpu::adjustNumberOfWarpsForBlockShuffle; -using iree_compiler::gpu::build1DSplittingStrategyWithOptionalThreadMapping; -using iree_compiler::gpu::buildCommonTrailingStrategy; -using iree_compiler::gpu::buildDistributeVectors; -using iree_compiler::gpu::kCudaMaxVectorLoadBitWidth; -using iree_compiler::gpu::ReductionConfig; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::gpu::SmallReductionStrategy; -using iree_compiler::gpu::threadX; -using iree_compiler::gpu::threadY; -using iree_compiler::gpu::threadZ; - -mlir::iree_compiler::gpu::SmallReductionStrategy::SmallReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig, const GPUModel &gpuModel) - : AbstractReductionStrategy(captures, {}), GPUStrategy(gpuModel) { - configure(reductionConfig); - LLVM_DEBUG(DBGS() << "use GPU small reduction strategy\n"); - LLVM_DEBUG(llvm::interleaveComma(workgroupTileSizes, - DBGS() << "--workgroupTileSizes: "); - llvm::dbgs() << "\n"); -} - -void mlir::iree_compiler::gpu::SmallReductionStrategy::configure( - const ReductionConfig &reductionConfig) { - int64_t maxNumThreadsToUse = reductionConfig.maxNumThreads; - assert(maxNumThreadsToUse > 0 && "maxNumThreadsToUse must be > 0"); - assert(maxNumThreadsToUse >= subgroupSize && "not even a warp?"); - - // Block-level - // =========== - // TODO: capture more dims than just the most minor parallel and have a more - // powerful `maybeDivisor` evaluation. - int64_t mostMinorParallelDimensionSize = - ArrayRef(captures.reductionOpSizes).drop_back().back(); - FailureOr maybeDivisor = maxDivisorOfValueBelowLimit( - mostMinorParallelDimensionSize, maxNumThreadsToUse); - - // Trailing elementwise unaligned tiling created bounded local buffers that - // are dynamic. Attempting to bound them in Common/PadDynamicAlloc.cpp results - // in a crash in the associated upstream util. - // TODO: More generally fix PadDynamicAlloc and the associated upstream util. - bool hasTrailingElementwise = (captures.maybeTrailingRank > 0); - if (failed(maybeDivisor) && hasTrailingElementwise) - maybeDivisor = 1; - - // If the captured dimension has no satisfactory divisor, just tile the last - // parallel dimension by 2 * subgroupSize. - int64_t numParallelLoops = captures.reductionRank - 1; - workgroupTileSizes.append(numParallelLoops, 1); - workgroupTileSizes.back() = - hasTrailingElementwise - ? *maybeDivisor - : std::min((int64_t)maxNumThreadsToUse, (int64_t)(2 * subgroupSize)); - - // Thread-level - // ============ - // Just running sequentially on each thread and relying on cache for - // locality. 
-} - -static void buildSmallReductionStrategyThreadDistribution( - ImplicitLocOpBuilder &b, Value variantH, Value maybeLeadingH, Value fillH, - Value reductionH, Value maybeTrailingH, - const SmallReductionStrategy &strategy) { - auto [fusionTargetH, fusionGroupH] = - iree_compiler::buildSelectFirstNonEmpty(b, maybeTrailingH, reductionH); - MLIRContext *ctx = b.getContext(); - SmallVector threadDimMapping{threadX(ctx), threadY(ctx), - threadZ(ctx)}; - threadDimMapping.resize(strategy.workgroupTileSizes.size()); - iree_compiler::TileToForallAndFuseAndDistributeResult tileResult = - iree_compiler::buildTileFuseDistToForallWithNumThreads( - /*builder=*/b, - /*variantH=*/variantH, - /*rootH=*/fusionTargetH, - /*opsToFuseH=*/fusionGroupH, - /*numThreads=*/ - getAsOpFoldResult(b.getI64ArrayAttr(strategy.workgroupTileSizes)), - /*threadDimMapping=*/b.getArrayAttr(threadDimMapping)); - fillH = - b.create(fillH, tileResult.forallH).getFusedOp(); - maybeLeadingH = - b.create(maybeLeadingH, tileResult.forallH) - .getFusedOp(); - - // 1. Scalarize all ops to ensure vectorization. - auto anyOpType = transform::AnyOpType::get(b.getContext()); - fillH = b.create(anyOpType, fillH); - maybeLeadingH = b.create(anyOpType, maybeLeadingH); - Value tiledH = b.create(anyOpType, tileResult.tiledOpH); - Value fusedH = b.create( - anyOpType, tileResult.resultingFusedOpsHandles.front()); - auto [blockReductionH, maybeBlockTrailingH] = - iree_compiler::buildSelectFirstNonEmpty(b, fusedH, tiledH); - - // 2. Apply the 1d splitting strategy to the reduction part while specifying - // a single thread. This triggers the splitting but not the thread mapping - // part. - build1DSplittingStrategyWithOptionalThreadMapping( - /*b=*/b, - /*variantH=*/variantH, - /*opH=*/blockReductionH, - /*rank=*/strategy.captures.reductionRank, - // TODO: capture and generalize mostMinorDim. - /*mostMinorDim=*/strategy.captures.reductionRank - 1, - /*opSizes=*/strategy.captures.reductionOpSizes, - /*numThreads=*/1); - - // 3. Apply the 1d splitting strategy to the trailing elementwise part while - // specifying a single thread. This triggers the splitting but not the thread - // mapping part. - build1DSplittingStrategyWithOptionalThreadMapping( - /*b=*/b, - /*variantH=*/variantH, - /*opH=*/maybeBlockTrailingH, - /*rank=*/strategy.captures.maybeTrailingRank, - // TODO: capture and generalize mostMinorDim. - /*mostMinorDim=*/strategy.captures.maybeTrailingRank - 1, - /*opSizes=*/strategy.captures.trailingOpSizes, - /*numThreads=*/1); -} - -void mlir::iree_compiler::gpu::buildSmallReductionStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const SmallReductionStrategy &strategy) { - // Step 1. Apply block-level part of the strategy, keeps everything fused. - ArrayRef workgroupTileSizes{strategy.workgroupTileSizes}; - auto [maybeLeadingHBlock, gridFillH, gridReductionH, maybeTiledTrailingHBlock, - forall] = - buildReductionStrategyBlockDistribution( - b, variantH, - workgroupTileSizes.take_front(strategy.captures.reductionRank - 1)); - - // Step 2. Apply thread-level part of the strategy, keeps everything fused. - buildSmallReductionStrategyThreadDistribution( - b, variantH, maybeLeadingHBlock, gridFillH, gridReductionH, - maybeTiledTrailingHBlock, strategy); - - // Step 3-4. Common trailing steps. 
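// [Illustrative aside, not part of the deleted sources] The block-level tile
// size chosen in SmallReductionStrategy::configure above hinges on one helper:
// find the largest divisor of the most-minor parallel dimension that stays
// within the thread budget, falling back to 2 * subgroupSize-style tiling when
// no such divisor exists. A plain stand-in for maxDivisorOfValueBelowLimit
// (assumed semantics: largest divisor of `value` that is <= `limit`, failing on
// the trivial divisor 1):
#include <algorithm>
#include <cstdint>
#include <optional>

std::optional<int64_t> maxDivisorBelowLimit(int64_t value, int64_t limit) {
  for (int64_t d = std::min(value, limit); d > 1; --d)
    if (value % d == 0)
      return d;
  return std::nullopt;  // only the trivial divisor 1 fits under the limit
}
// e.g. maxDivisorBelowLimit(96, 128) == 96; maxDivisorBelowLimit(97, 128) == std::nullopt.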
- buildCommonTrailingStrategy(b, variantH, strategy.getNumThreadsInBlock()); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h deleted file mode 100644 index 79a76db6b0dd..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_SMALL_REDUCTION_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_SMALL_REDUCTION_STRATEGY_H_ - -#include - -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" - -namespace mlir::iree_compiler::gpu { - -/// Encode a strategy targeted at (very) small reductions, for which other -/// strategies perform poorly. -/// -/// In the case of small reductions, we cannot make an efficient use of warp -/// shuffles. Instead, take advantage of caches. -/// This strategy aims at running the reduction sequentially within each -/// thread and taking parallelism from outer dimensions that we would -/// otherwise use for block-level parallelism. -/// -/// There are 2 cases: -/// 1. we can find good divisors of outer parallel dimensions and avoid -/// creating dynamic tile sizes. We can then vectorize to the reduction -/// size. -/// 2. we cannot find good divisors, we pay the price of dynamic loops. -/// -// TODO: Refine 1. with linalg splitting on the reduction dimension. -// TODO: Refine 2. with linalg splitting on the parallel dimension. -// -// Note: All this is to be able to handle very small and small-ish -// reductions without catastrophic regressions. -// TODO: Add another strategy based on segmented scans, which can allow us -// to force sizes that don't divide properly into warp shuffles. -class SmallReductionStrategy : public AbstractReductionStrategy, GPUStrategy { -public: - SmallReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig, const GPUModel &gpuModel); - - SmallReductionStrategy(const SmallReductionStrategy &) = default; - SmallReductionStrategy &operator=(const SmallReductionStrategy &) = default; - - std::array getNumThreadsInBlock() const { - std::array res{1, 1, 1}; - for (int64_t i = 0, e = workgroupTileSizes.size(); i < e; ++i) - res[i] = workgroupTileSizes[i]; - return res; - } - -private: - /// Compute the small strategy based on the problem size and the - /// `maxNumThreadsToUse`. 
- void configure(const ReductionConfig &reductionConfig); -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_SMALL_REDUCTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.cpp deleted file mode 100644 index fbe4b11e27e5..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.cpp +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h" - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") - -// TODO: significantly better namespacing. -using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp; -using iree_compiler::IREE::transform_dialect::ShareForallOperandsOp; -using iree_compiler::IREE::transform_dialect::VectorToWarpExecuteOnLane0Op; -using iree_compiler::IREE::transform_dialect::VectorWarpDistributionOp; -using transform::FuseIntoContainingOp; -using transform::MatchOp; -using transform::ScalarizeOp; -using transform::SequenceOp; -using transform_ext::StructuredOpMatcher; - -using iree_compiler::buildTileReductionUsingScfForeach; -using iree_compiler::gpu::adjustNumberOfWarpsForBlockShuffle; -using iree_compiler::gpu::build1DSplittingStrategyWithOptionalThreadMapping; -using iree_compiler::gpu::buildCommonTrailingStrategy; -using iree_compiler::gpu::buildDistributeVectors; -using iree_compiler::gpu::kCudaMaxVectorLoadBitWidth; -using iree_compiler::gpu::ReductionConfig; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::gpu::StagedReductionStrategy; -using iree_compiler::gpu::threadX; -using iree_compiler::gpu::threadY; - -mlir::iree_compiler::gpu::StagedReductionStrategy::StagedReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig, const GPUModel &gpuModel) - : AbstractReductionStrategy(captures, {}), GPUStrategy(gpuModel) { - configure(reductionConfig); - LLVM_DEBUG(DBGS() << "use GPU staged reduction strategy\n"); - LLVM_DEBUG(llvm::interleaveComma(workgroupTileSizes, - DBGS() << "--workgroupTileSizes: "); - llvm::dbgs() << "\n"); -} - -void mlir::iree_compiler::gpu::StagedReductionStrategy::configure( - const ReductionConfig &reductionConfig) { - int64_t maxNumThreadsToUse = reductionConfig.maxNumThreads; - int64_t maxVectorSize = reductionConfig.vectorSize; - assert(maxNumThreadsToUse > 0 && "maxNumThreadsToUse must be > 
0"); - assert(maxNumThreadsToUse >= subgroupSize && "need at least a warp?"); - assert(maxVectorSize > 0 && "maxVectorSize must be > 0"); - - // Block-level - // =========== - // Tile all the parallel dimensions to 1 and create many blocks. - // TODO: Investigate taking some sizes that divide the dimensions and make - // the kernel meatier. - int64_t numParallelLoops = captures.reductionRank - 1; - workgroupTileSizes.append(numParallelLoops, 1); - - // Thread-level - // ============ - // Stage 1 - // ------- - // Maximal vector size that divides the problem size. - // TODO: Split to ensure 4 on most of the problem and use a 1-epilogue. - int64_t reductionDimensionSize = captures.reductionOpSizes.back(); - // Tile reduction to the maximal multiple `vectorSize` allowed. - // This locally reduces the large unknown reduction into a guaranteed - // multiple of `vectorSize`. - if (ShapedType::isDynamic(reductionDimensionSize)) { - // In the dynamic case, always run vector size of 1 and pad to the maximal - // warp size below the `maxNumThreadsToUse` limit. - vectorSize = 1; - numThreadsXInBlock = - iree_compiler::previousMultipleOf(maxNumThreadsToUse, subgroupSize); - } else { - // Adjust the vector size to the max power of 2 that divides the reduction, - // this dimensions the vector properly, whatever the elemental type. - assert((maxVectorSize & (maxVectorSize - 1)) == 0 && - "maxVectorSize must be a power of 2"); - // TODO: we could also split out the first multiple of vectorSize instead - // of reducing the vectorSize. This is better done with future stride / - // alignment in mind. - // TODO: splitting here also requires the post-bufferization privatization - // analysis (see #11715). - for (vectorSize = maxVectorSize; vectorSize > 1; vectorSize >>= 1) - if (reductionDimensionSize % vectorSize == 0) - break; - // Pad to the next multiple of the warp size above - // `reductionDimensionSize / vectorSize` but below `maxNumThreadsToUse`. - numThreadsXInBlock = std::min( - iree_compiler::nextMultipleOf(reductionDimensionSize / vectorSize, - subgroupSize), - iree_compiler::previousMultipleOf(maxNumThreadsToUse, subgroupSize)); - } -} - -static Value shareForeachArgument(ImplicitLocOpBuilder &b, Value Forall, - ArrayRef indices) { - auto foreachType = transform::OperationType::get( - b.getContext(), scf::ForallOp::getOperationName()); - Forall = b.create(foreachType, Forall); - return b - .create( - foreachType, Forall, indices); -} - -static void buildStagedReductionStrategyThreadLevel( - ImplicitLocOpBuilder &b, Value variantH, Value gridReductionH, - Value gridFillH, Value maybeTiledLeadingH, Value maybeTiledTrailingH, - const StagedReductionStrategy &strategy) { - MLIRContext *ctx = b.getContext(); - // Map the potential maybeTiledLeadingH. - // TODO: Consider fusing leading elementwise into threads. - if (strategy.captures.maybeLeadingRank > 0) { - int64_t vectorSize = - kCudaMaxVectorLoadBitWidth / - strategy.captures.maybeLeadingOutputElementalTypeBitWidth; - assert((vectorSize & (vectorSize - 1)) == 0 && "size must be power of 2"); - build1DSplittingStrategyWithOptionalThreadMapping( - /*b=*/b, - /*variantH=*/variantH, - /*opH=*/maybeTiledLeadingH, - /*rank=*/strategy.captures.maybeLeadingRank, - // TODO: capture and generalize mostMinorDim. 
- /*mostMinorDim=*/strategy.captures.maybeLeadingRank - 1, - /*opSizes=*/strategy.captures.leadingOpSizes, - /*numThreads=*/strategy.getNumThreadsInBlock().front(), - /*mappingAttr=*/threadX(ctx), - /*maxVectorSize=*/vectorSize); - } - - // Staged reduction step 1: break gridReductionH apart. - auto [blockParallelForallOp, blockParallelFillH, blockCombinerOpH] = - buildTileReductionUsingScfForeach( - /*b=*/b, - /*isolatedParentOpH=*/variantH, - /*reductionH=*/gridReductionH, - /*reductionRank=*/strategy.captures.reductionRank, - /*tileSize=*/strategy.getNumThreadsInBlock().front(), - /*reductionVectorSize=*/strategy.getVectorSize(), - /*mappingAttr=*/threadX(ctx)); - - // Staged reduction step 2: multi-warp shuffle reduce. - // Map the combiner reduction to one thread along y. Mapping this part along - // y only will trigger the insertion of an `scf.if (threadIdx.x == 0)` - // predicate after `scf.forall` is lowered. - // This predicate allows further vector distribution to kick in. - Value root = blockCombinerOpH; - SmallVector opsToFuse = {gridFillH}; - - // By the properties matching, we know the optional trailing op takes the - // result of the reduction as an input argument. - // It necessarily follows that maybeTrailingRank >= reductionRank - 1. - // When maybeTrailingRank == reductionRank - 1, by the properties of the - // transformations we have applied until now, we know that the elementwise is - // a simple scalar operation and it can be fused in the producing reduction - // without creating recomputations. - // TODO: Some `transform.assert` op that the shape of the op is indeed 1s only - // as a safety measure. - // TODO: More composable transform strategy parts require more matching after - // part of the strategy has been applied. See the discussion in #11951 for - // more context. - if (strategy.captures.maybeTrailingRank == - strategy.captures.reductionRank - 1) { - root = maybeTiledTrailingH; - opsToFuse.push_back(blockCombinerOpH); - } - iree_compiler::buildTileFuseDistToForallWithTileSizes( - /*b=*/b, - /*variantH=*/variantH, - /*rootH=*/root, - /*opsToFuse=*/opsToFuse, - /*tileSizes=*/getAsOpFoldResult(b.getI64ArrayAttr({1})), - /*mappingAttr=*/b.getArrayAttr(threadY(ctx))); - - // Map the potential maybeTiledTrailingH if it hasn't been fused with the - // reduction. - if (root != maybeTiledTrailingH && strategy.captures.maybeTrailingRank > 0) { - int64_t vectorSize = - iree_compiler::gpu::kCudaMaxVectorLoadBitWidth / - strategy.captures.maybeTrailingOutputElementalTypeBitWidth; - build1DSplittingStrategyWithOptionalThreadMapping( - /*b=*/b, - /*variantH=*/variantH, - /*opH=*/maybeTiledTrailingH, - /*rank=*/strategy.captures.maybeTrailingRank, - // TODO: capture and generalize mostMinorDim. - /*mostMinorDim=*/strategy.captures.maybeTrailingRank - 1, - /*opSizes=*/strategy.captures.trailingOpSizes, - /*numThreads=*/strategy.getNumThreadsInBlock().front(), - /*mappingAttr=*/threadX(ctx), - /*maxVectorSize=*/vectorSize); - } -} - -/// Builds the transform IR tiling reductions for CUDA targets. Supports -/// reductions in the last dimension, with optional leading and trailing -/// elementwise operations. -void mlir::iree_compiler::gpu::buildStagedReductionStrategy( - ImplicitLocOpBuilder &b, Value variantH, - const StagedReductionStrategy &strategy) { - // Step 1. Match and tile to introduce the top-level scf.forall for - // the block/workgroup level. Keep everything fused. 
- ArrayRef workgroupTileSizes{strategy.workgroupTileSizes}; - auto [maybeLeadingHBlock, gridFillH, gridReductionH, maybeTiledTrailingHBlock, - commonEnclosingForallH] = - buildReductionStrategyBlockDistribution( - b, variantH, - workgroupTileSizes.take_front(strategy.captures.reductionRank - 1)); - - // Step 2. Split the reduction and tile the pieces to ensure vector - // load/stores and mapping to a single warp with shuffles. - buildStagedReductionStrategyThreadLevel(b, variantH, gridReductionH, - gridFillH, maybeLeadingHBlock, - maybeTiledTrailingHBlock, strategy); - - // Step 3. Make sure we don't create allocation by sharing forall - // output. This amounts to injecting user-defined static information that each - // thread accesses only a private slice. This needs to be added late, once we - // don't need handles anymore, because contained handles are currently always - // invalidated, even when modified inplace. - // TODO: Relax nested invalidation for transforms that only move or modify - // contained ops inplace. - shareForeachArgument(b, commonEnclosingForallH, ArrayRef({0})); - - // Step 4-5. Common trailing steps. - auto [variantH2, funcH] = - buildCommonTrailingStrategy(b, variantH, strategy.getNumThreadsInBlock()); - - // Step 6. The staged strategy has a post-bufferization vector distribution - // with rank-reduction. The vector distribution occurs on multiple warps and - // is itself internally staged in 2 stages. - // Distribute the reduction on all the threads of the group. This allows us - // to have the same data layout for the partial reduction and the merge and - // therefore we can optimize away the temporary memory usage. - buildDistributeVectors(b, variantH2, funcH, strategy.getTotalNumThreads()); - - // Step 7. Apply clean up of memory operations. - funcH = b.create(variantH2, func::FuncOp::getOperationName()); - iree_compiler::buildMemoryOptimizations(b, funcH); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h deleted file mode 100644 index c76195c19f8f..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STAGED_REDUCTION_STRATEGY_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STAGED_REDUCTION_STRATEGY_H_ - -#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" - -namespace mlir::iree_compiler::gpu { - -/// Encode a 3-staged strategy for a 1-d reduction mapped to a block. -/// -/// This happens in a staged fashion to encode good tradeoffs between amount -/// of parallelism, occupancy and granularity of the load/store operations. -/// The tradeoff is controlled at a distance by specifying a -/// `maxNumThreadsToUse` upper bound. -/// -/// Bottom-up perspective: -/// ====================== -/// Stage 3: second stage of the the warp shuffle step reduces a vector -/// element to a single element. Only threadIdx == 0 commits to memory. 
-/// -/// Stage 2: the second stage of the reduction is the first stage of the warp -/// shuffle step. It is normalized to reduce from a "k-warps" abstraction, -/// across all warps in parallel, to a k-element result. Only the first thread -/// within each warp (e.g. threadIdx % subgroupSize == 0) commits to memory. -/// -/// Stage 1: the first stage of the reduction is normalized to run on "k-warps" -/// of maximal vector size for both the hardware and the problem sizes. -/// The over-provisioning to "k-warps" allows multiple warps to run in parallel. -/// The `numThreadsXInBlock` is this "k-warps" quantity and is also the -/// number of threads (i.e. blockDim.x) used to parallelize the problem. -/// This also results in `numThreadsXInBlock` live values that are -/// allocated in shared memory and creates a tradeoff between parallelism and -/// occupancy. -/// The normalization guarantees that whatever the problem size P, we reduce -/// from `tensor
` to `tensor` by using the -/// largest possible `vector.transfer` operations. The vector size is chosen as -/// follows: when the `reductionDimensionSize` is a multiple of 4, choose 4; -/// otherwise try with 2; otherwise just use 1. -// -// TODO: Split to ensure 4 on most of the problem and use a 1-epilogue. This is -// best done if we can encode the future stride to ensure the 4 is aligned. -class StagedReductionStrategy : public AbstractReductionStrategy, GPUStrategy { -public: - StagedReductionStrategy( - const transform_ext::MatchedReductionCaptures &captures, - const ReductionConfig &reductionConfig, const GPUModel &targetGpu); - - StagedReductionStrategy(const StagedReductionStrategy &) = default; - StagedReductionStrategy &operator=(const StagedReductionStrategy &) = default; - - std::array getNumThreadsInBlock() const { - return {numThreadsXInBlock, 1, 1}; - } - - int64_t getVectorSize() const { return vectorSize; } - - int64_t getNumWarps() const { - assert(numThreadsXInBlock % subgroupSize == 0 && - "staged reduction strategy requires full warps"); - return numThreadsXInBlock / subgroupSize; - } - - int64_t getTotalNumThreads() const { return numThreadsXInBlock; } - -private: - /// Compute the staged strategy based on the reductionDimensionSize, the - /// `maxNumThreadsToUse` and the `vectorSize`. - /// The latter 2 numbers control the tradeoff between parallelism and shared - /// memory consumption. - // TODO: Characterize shared memory consumption and limit for good occupancy. - void configure(const ReductionConfig &reductionConfig); - - /// Maximal vector size (among {1, 2, 4}) that divides the - /// `reductionDimensionSize` and is used for vector transfers in Stage 1. - int64_t vectorSize; - - /// Maximal "k-warp" size within the limits of the `maxNumThreadsToUse` and - /// `reductionDimensionSize` parameters. - /// This is also the blockDim.x of the kernel. - int64_t numThreadsXInBlock; -}; - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STAGED_REDUCTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp deleted file mode 100644 index 770091bda511..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp +++ /dev/null @@ -1,814 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
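// [Illustrative aside, not part of the deleted sources] A standalone sketch of
// StagedReductionStrategy::configure above for the static-shape case: pick the
// largest power-of-two vector size (<= maxVectorSize) dividing the reduction
// size, then round the resulting "k-warp" thread count to subgroup multiples
// within the maxNumThreadsToUse budget. nextMultipleOf/previousMultipleOf are
// re-implemented inline as stand-ins for the deleted IREE helpers.
#include <algorithm>
#include <cstdint>
#include <utility>

static int64_t nextMultipleOf(int64_t v, int64_t m) { return ((v + m - 1) / m) * m; }
static int64_t previousMultipleOf(int64_t v, int64_t m) { return (v / m) * m; }

// Returns {vectorSize, numThreadsXInBlock} for a static reduction size.
std::pair<int64_t, int64_t> stagedReductionConfig(int64_t reductionDimensionSize,
                                                  int64_t maxVectorSize,
                                                  int64_t maxNumThreadsToUse,
                                                  int64_t subgroupSize) {
  int64_t vectorSize = maxVectorSize;  // assumed power of two, as asserted above
  for (; vectorSize > 1; vectorSize >>= 1)
    if (reductionDimensionSize % vectorSize == 0)
      break;
  int64_t numThreadsX =
      std::min(nextMultipleOf(reductionDimensionSize / vectorSize, subgroupSize),
               previousMultipleOf(maxNumThreadsToUse, subgroupSize));
  return {vectorSize, numThreadsX};
}
// e.g. reduction size 4096, maxVectorSize 4, maxNumThreads 256, subgroup 32
//      -> vectorSize 4, numThreadsX = min(1024, 256) = 256;
//      reduction size 70 -> vectorSize 2, numThreadsX = min(64, 256) = 64.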
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" - -#include - -#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" -#include "iree-dialects/Transforms/TransformMatchers.h" -#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" -#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" -#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/StagedReductionStrategy.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" -#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" -#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" -#include "mlir/IR/TypeUtilities.h" - -using namespace mlir; - -#define DEBUG_TYPE "iree-transform-builder" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(llvm::dbgs() << '[' << DEBUG_TYPE << "] " << X) - -llvm::cl::opt clGPUEnableTransformDialectMatmulTensorCoreStrategy( - "iree-codegen-llvmgpu-enable-transform-dialect-matmul-tensorcore-strategy", - llvm::cl::desc("activate the matmul tensorcore strategy"), - llvm::cl::init(true)); -llvm::cl::opt clGPUEnableTransformDialectImplicitGemmStrategy( - "iree-codegen-llvmgpu-enable-transform-dialect-implicit-gemm-strategy", - llvm::cl::desc("activate the convolution implicit gemm strategy"), - llvm::cl::init(false)); -llvm::cl::opt clGPUEnableTransformDialectAlignedMatmul( - "iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul", - llvm::cl::desc( - "activate the matmul tensorcore strategy for tile aligned shapes"), - llvm::cl::init(false)); -llvm::cl::opt clGPUEnableTransformDialectSmallMatmul( - "iree-codegen-llvmgpu-enable-transform-dialect-small-matmul", - llvm::cl::desc("activate the matmul tensorcore strategy for small shapes " - "(< 16) in at least a dimension"), - llvm::cl::init(false)); -llvm::cl::opt clGPUEnableTransformDialectPadStrategy( - "iree-codegen-llvmgpu-enable-transform-dialect-pad-strategy", - llvm::cl::desc("activate the pad strategy"), llvm::cl::init(false)); -llvm::cl::opt clGPUEnableTransformDialectBatchMatmulStrategy( - "iree-codegen-llvmgpu-enable-transform-dialect-batch-matmul-strategy", - llvm::cl::desc("activate the batch matmul strategy, additional " - "configuration 
flags are shared with matmul"), - llvm::cl::init(false)); - -// TODO: significantly better namespacing. -using iree_compiler::gpu::AbstractGemmLikeStrategy; -using iree_compiler::gpu::BatchMatmulStrategy; -using iree_compiler::gpu::GPUModel; -using iree_compiler::gpu::ImplicitGemmStrategy; -using iree_compiler::gpu::kCudaMaxVectorLoadBitWidth; -using iree_compiler::gpu::MatmulStrategy; -using iree_compiler::gpu::PadConfig; -using iree_compiler::gpu::PadStrategy; -using iree_compiler::gpu::ReductionConfig; -using iree_compiler::gpu::ReductionStrategy; -using iree_compiler::gpu::scaleUpByBitWidth; -using iree_compiler::gpu::SmallReductionStrategy; -using iree_compiler::gpu::StagedReductionStrategy; -using transform_ext::CapturingOpMatcher; -using transform_ext::MatchCallbackOp; -using transform_ext::MatchedMatmulCaptures; -using transform_ext::MatchedPadCaptures; -using transform_ext::MatchedReductionCaptures; -using transform_ext::MatcherContext; -using transform_ext::RegisterMatchCallbacksOp; -using transform_ext::StructuredOpMatcher; - -//===----------------------------------------------------------------------===// -// Higher-level problem-specific strategy creation APIs, these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// - -//===--------------------------------------------------------------------===// -// Reduction strategies. -//===--------------------------------------------------------------------===// -/// Placeholder to encode fixed reductions that should take finer-grained -/// precedence over other heuristics. In the future, this could be lifted to -/// e.g. `gpuModel` or higher up in some transform dialect database summary of -/// "known good things". -static FailureOr applyKnownGoodReductionConfigurations( - const transform_ext::MatchedReductionCaptures &captures, - const GPUModel &gpuModel) { - auto staged = ReductionStrategy::Staged; - int64_t reductionSize = captures.reductionOpSizes.back(); - if (gpuModel.model == GPUModel::kDefaultGPU) { - if (captures.reductionOutputElementalTypeBitWidth == 32) { - if (reductionSize == 64) - return ReductionConfig{/*maxNumThreads=*/64, /*vectorSize=*/1, staged}; - if (reductionSize == 128) - return ReductionConfig{/*maxNumThreads=*/32, /*vectorSize=*/4, staged}; - if (reductionSize == 512) - return ReductionConfig{/*maxNumThreads=*/256, /*vectorSize=*/2, staged}; - } - } - return failure(); -} - -/// The configurations below have been determined empirically by performing a -/// manual tradeoff between problem size, amount of parallelism and vector -/// size on a particular NVIDIA RTX2080Ti 12GB card. This is a coarse tradeoff -/// that should generally give reasonably good results but that begs to be -/// complemented by hardcoded known good configurations and ultimately a -/// database and/or a random forest compression of configurations with -/// guaranteed performance. -// TODO: Lift some of the strategy sizing logic as hints and/or heuristics to -// also work properly in the dynamic case. -// TODO: Support more HW configs and make it more pluggable. 
-static ReductionConfig -getReductionConfig(const transform_ext::MatchedReductionCaptures &captures, - const GPUModel &gpuModel) { - auto maybeHardcodedConfiguration = - applyKnownGoodReductionConfigurations(captures, gpuModel); - if (succeeded(maybeHardcodedConfiguration)) - return *maybeHardcodedConfiguration; - - //===--------------------------------------------------------------------===// - // Small reduction strategy. - //===--------------------------------------------------------------------===// - // Dynamic reductions are never supported by default because we can - // never know offhand whether we are in a small-reduction regime mode. - // Since this mode does not coalesce reads, perf will suffer - // catastrophically on larger runtime reduction. - // TODO: explicit hint from above that we really want to do that. - int64_t redSize = captures.reductionOpSizes.back(); - bool isDynamicReduction = ShapedType::isDynamic(redSize); - // Otherwise, still only support the small cases for now and fall back to - // other strategies otherwise. - bool isSmallReduction = (redSize < 2 * gpuModel.subgroupSize); - if (!isDynamicReduction && isSmallReduction) { - int64_t maxNumThreads = 4 * gpuModel.subgroupSize; - return ReductionConfig{maxNumThreads, 0, ReductionStrategy::Small}; - } - - //===--------------------------------------------------------------------===// - // Staged reduction strategy. - //===--------------------------------------------------------------------===// - int64_t bitWidth = captures.reductionOutputElementalTypeBitWidth; - int64_t vectorSize = scaleUpByBitWidth(4, bitWidth); - int64_t maxNumThreads = 8 * gpuModel.subgroupSize; - // No adjustments in the dynamic case, we need extra information to make a - // good decision. - if (ShapedType::isDynamic(redSize)) - return ReductionConfig{maxNumThreads, vectorSize, - ReductionStrategy::Staged}; - // Scale down to smaller sizes (4, 8, 16)-warps. - if (scaleUpByBitWidth(redSize, bitWidth) <= 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(1, bitWidth); - maxNumThreads = 4 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) <= - 8 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(2, bitWidth); - maxNumThreads = 4 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) <= - 8 * 2 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 4 * gpuModel.subgroupSize; - } - // Scale up to larger sizes (32, 64, 128+)-warps, using vector-4. - if (!captures.trailingOpSizes.empty()) { - if (scaleUpByBitWidth(redSize, bitWidth) >= - 128 * 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 32 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) >= - 64 * 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 16 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) >= - 32 * 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 8 * gpuModel.subgroupSize; - } else if (scaleUpByBitWidth(redSize, bitWidth) >= - 16 * 4 * gpuModel.subgroupSize) { - vectorSize = scaleUpByBitWidth(4, bitWidth); - maxNumThreads = 4 * gpuModel.subgroupSize; - } - } - return ReductionConfig{maxNumThreads, vectorSize, ReductionStrategy::Staged}; -} - -/// Map an N-D parallel, 1-D reduction operation with optional leading and -/// optional trailing elementwise operations. 
-/// The 1-D reduction dimension must be in the most minor dimension. -/// The innermost dimensions of the leading and trailing operations must be -/// most minor along all accesses. Return failure if matching fails. On a -/// successful match, configure a reduction strategy based on a proxy model of -/// the hardware and construct transform dialect IR that implements the -/// reduction strategy. The transform dialect IR is added in a top-level -/// ModuleOp after the `entryPoint` mlir::FunctionOpInterface. -static LogicalResult -matchAndSetReductionStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, const GPUModel &gpuModel) { - if (!gpuModel.hasWarpShuffle) { - LDBG("--Reduction strategy no warp shuffle\n"); - return failure(); - } - - // 1. Match a reduction and surrounding ops. - StructuredOpMatcher *reduction; - transform_ext::MatchedReductionCaptures captures; - transform_ext::MatcherContext matcherContext; - makeReductionMatcher(matcherContext, reduction, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *reduction)) { - LDBG("--Reduction strategy failed to match\n"); - return failure(); - } - - // 2. Construct the configuration and the strategy builder. - // TODO: Generalize along the HW axis. - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - ReductionConfig reductionConfig = getReductionConfig(captures, gpuModel); - if (reductionConfig.strategy == ReductionStrategy::Small) { - SmallReductionStrategy strategy(captures, reductionConfig, gpuModel); - return buildSmallReductionStrategy(b, variant, strategy); - } else if (reductionConfig.strategy == ReductionStrategy::Staged) { - // Otherwise, always fallback to the staged strategy. - StagedReductionStrategy strategy(captures, reductionConfig, gpuModel); - return buildStagedReductionStrategy(b, variant, strategy); - } else { - return llvm_unreachable("Unknown strategy"); - } - }; - - // 3. Build strategy embedded into the IR. - mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} - -//===--------------------------------------------------------------------===// -// Matmul strategies. -//===--------------------------------------------------------------------===// -/// Placeholder to encode fixed matmuls that should take finer-grained -/// precedence over other heuristics. In the future, this could be lifted to -/// e.g. `gpuModel` or higher up in some transform dialect database summary of -/// "known good things". -static FailureOr applyKnownGoodMatmulConfigurations( - const transform_ext::MatchedMatmulCaptures &captures, - const GPUModel &gpuModel) { - return failure(); -} - -static int64_t -selectLargestFailsafeValueIfNeeded(int64_t value, int64_t limit, - ArrayRef thresholds, - ArrayRef failSafeValues) { - for (auto [threshold, failSafeValue] : - llvm::zip(thresholds, failSafeValues)) { - if (limit < threshold && value > failSafeValue) - return failSafeValue; - } - return value; -} - -static void failSafeOverrides(MatmulStrategy &strategy, - const GPUModel &gpuModel) { - // Failsafe for blockTileM to avoid tiling by > size (i.e. no tiling). - int64_t blockTileM = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.blockTileM(), - /*limit=*/strategy.m(), - /*thresholds=*/{2, 4, 8, 16, 32, 64, 128}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 32, 64}); - // Failsafe for blockTileN to avoid tiling by > size (i.e. no tiling). 
- int64_t blockTileN = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.blockTileN(), - /*limit=*/strategy.n(), - /*thresholds=*/{2, 4, 8, 16, 32, 64, 128}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 32, 64}); - // Failsafe for reductionSize to avoid tiling by > size (i.e. no tiling). - int64_t reductionTileSize = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.reductionTileSize, - /*limit=*/strategy.k(), - /*thresholds=*/{2, 4, 8, 16, 24, 32, 40, 48, 56, 64}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 24, 32, 40, 48, 56}); - - // If some dimension is small, use fmas. - // TODO: more parallelism by locally splitting the K-loop and reducing in the - // fma case. - if (blockTileM < 16 || blockTileN < 16 || reductionTileSize < 16) { - strategy.useMmaSync = false; - strategy.useWmma = false; - strategy.useFma = true; - } - - strategy.blockTileSizes = {blockTileM, blockTileN}; - strategy.reductionTileSize = reductionTileSize; - - // Avoid too deep pipelines. This should also look at shared memory usage in - // the future. - if (strategy.pipelineDepth * strategy.reductionTileSize > strategy.k()) { - strategy.pipelineDepth = - llvm::divideFloorSigned(strategy.k(), strategy.reductionTileSize); - } -} - -/// The configurations below have been determined empirically. -// TODO: Significantly improve these heuristics. -static MatmulStrategy -getMatmulConfig(MLIRContext *context, - const transform_ext::MatchedMatmulCaptures &captures, - const GPUModel &gpuModel) { - MatmulStrategy strategy(context, captures, gpuModel); - if (strategy.cliOptionsSpecified) - return strategy; - - auto maybeHardcodedConfiguration = - applyKnownGoodMatmulConfigurations(captures, gpuModel); - if (succeeded(maybeHardcodedConfiguration)) - return *maybeHardcodedConfiguration; - - // TODO: encode a decision tree of reasonnable heuristics here. - - // Apply failsafe overrides to avoid identified bad corner cases. - failSafeOverrides(strategy, gpuModel); - - return strategy; -} - -/// Update the strategy to make sure it can be consumed by the codegen. In -/// particular, make sure that tile sizes are smaller than the problem sizes to -/// actually trigger tiling and mapping to blocks and threads. -static void failSafeOverrides(BatchMatmulStrategy &strategy, - const GPUModel &gpuModel) { - // Configure the strategy as if for a matmul. - failSafeOverrides(static_cast(strategy), gpuModel); - - // Failsafe for blockTileBatch to avoid tiling by > size (i.e. no tiling). - int64_t blockTileBatch = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.blockTileBatch(), - /*limit=*/strategy.batch(), - /*thresholds=*/{2, 4, 8, 16, 32, 64, 128}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 32, 64}); - - // Override the matmul configuration to be suitable for batch matmul. - // Specifically, prepend the tile size for the batch dimension and force FMA. - strategy.blockTileSizes.insert(strategy.blockTileSizes.begin(), - blockTileBatch); - - strategy.useMmaSync = false; - strategy.useWmma = false; - strategy.useFma = true; -} - -/// Produce a strategy for the batch matmul characterized by the given capture -/// list (shapes and types). -static BatchMatmulStrategy getBatchMatmulConfig(MLIRContext *context, - MatchedMatmulCaptures &captures, - const GPUModel &gpuModel) { - // Command-line arguments trump everything. - BatchMatmulStrategy strategy(context, gpuModel, captures); - if (strategy.cliOptionsSpecified) - return strategy; - - // TODO: fixed strategies and decision tree/heuristic. 
- - failSafeOverrides(strategy, gpuModel); - return strategy; -} - -/// Match the supported batch matmuls and set the transform dialect strategy for -/// them. -static LogicalResult -matchAndSetBatchMatmulStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, const GPUModel &gpuModel) { - if (!clGPUEnableTransformDialectBatchMatmulStrategy) { - LDBG("--Batch matmul strategy flag turned off\n"); - return failure(); - } - - StructuredOpMatcher *fill; - StructuredOpMatcher *bmm; - transform_ext::MatchedMatmulCaptures captures; - transform_ext::MatcherContext matcherContext; - transform_ext::makeBatchMatmulMatcher(matcherContext, bmm, fill, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *bmm)) { - LDBG("--Batch matmul strategy failed to match\n"); - return failure(); - } - - if (captures.contractionDims.batch.size() != 1 || - captures.contractionDims.m.size() != 1 || - captures.contractionDims.n.size() != 1 || - captures.contractionDims.k.size() != 1 || captures.batches()[0] != 0 || - captures.m() != 1 || captures.n() != 2 || captures.k() != 3) { - LDBG("--Only support batch matmul with b, m, n, k iterator order atm\n"); - return failure(); - } - - BatchMatmulStrategy strategy = - getBatchMatmulConfig(entryPoint->getContext(), captures, gpuModel); - if (failed(strategy.validate(gpuModel))) { - LDBG("--Batch matmul strategy failed to validate\n"); - return failure(); - } - - iree_compiler::createTransformRegion(entryPoint, [&](ImplicitLocOpBuilder &b, - Value variantH) { - return iree_compiler::gpu::buildBatchMatmulStrategy(b, variantH, strategy); - }); - return success(); -} - -static LogicalResult -matchAndSetMatmulStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, const GPUModel &gpuModel) { - if (!clGPUEnableTransformDialectMatmulTensorCoreStrategy) { - LDBG("--Matmul strategy flag turned off\n"); - return failure(); - } - - // 1. Match a reduction and surrounding ops. - StructuredOpMatcher *fill; - StructuredOpMatcher *matmul; - StructuredOpMatcher *trailing; - transform_ext::MatchedMatmulCaptures captures; - transform_ext::MatcherContext matcherContext; - makeMatmulMatcher(matcherContext, matmul, fill, trailing, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *matmul)) { - LDBG("--Matmul strategy fail to match\n"); - return failure(); - } - - // We are very peculiar about the dispatches we want to match for now: - // - f32 only atm. - // - Mandatory fill op. - // - No trailing op. - // - If the matmul is "too aligned", then guard on the alignment flag. - // - If the matmul is "too small", then use the default IREE strategy. - // - Otherwise, we take it. - if (!fill->getCaptured() || trailing->getCaptured()) { - LDBG("--Matmul strategy fill / trailing preconditions failed\n"); - return failure(); - } - - // TODO: Generalize to a good mix of sizes, alignments and element types. - const auto &matmulSize = captures.matmulOpSizes; - if (matmulSize.size() != 3) { - LDBG("--Matmul strategy size capture failed\n"); - return failure(); - } - - // Currently the unaligned transform strategy does not properly handle - // degenerate dimensions that should have been rank-reduced (e.g. `1`). - // Also, it is unprofitable to force small matmuls through a high latency - // tensorcore path, we are better off with a simple simt strategy. - // TODO: profitability details can be ironed out in the future when we have a - // heuristic to better select strategy parameters. 
- bool smallCases = (matmulSize[0] > 0 && matmulSize[0] < 16) || - (matmulSize[1] > 0 && matmulSize[1] < 16) || - (matmulSize[2] > 0 && matmulSize[2] < 16); - if (smallCases && !clGPUEnableTransformDialectSmallMatmul) { - LDBG("--Matmul strategy small size check failed\n"); - return failure(); - } - - // Currently the fully aligned case still lags behind the current default - // pipeline and thus is guarded by a flag. This is the case when at least one - // of the following holds - // - m is tile aligned (conservatively, take 64) - // - n is tile aligned (conservatively, take 64) - // - k is tile aligned (conservatively, take 16) - bool guardedAlignedCases = matmulSize[0] % 64 == 0 || - matmulSize[1] % 64 == 0 || matmulSize[2] % 16 == 0; - - if (!smallCases && guardedAlignedCases && - !clGPUEnableTransformDialectAlignedMatmul) { - LDBG("--Matmul strategy alignment check failed\n"); - return failure(); - } - - iree_compiler::gpu::MatmulStrategy strategy = - getMatmulConfig(op->getContext(), captures, gpuModel); - LLVM_DEBUG(strategy.dump()); - - // Validate the strategy configuration against the compilation target. - if (failed(strategy.validate(gpuModel))) { - LDBG("--Matmul strategy failed to validate\n"); - return failure(); - } - - // Limit the types that we choose to support without user intervention for - // tensor core. - if (!strategy.useFma && !strategy.cliOptionsSpecified && - (!captures.lhsElementType.isF32() || !captures.rhsElementType.isF32() || - !captures.outputElementType.isF32())) { - LDBG("--Matmul strategy elemental type check failed\n"); - return failure(); - } - - // 2. Construct the configuration and the strategy builder. - // TODO: Generalize along the HW axis. - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - return buildMatmulTensorCoreStrategy(b, variant, strategy); - }; - - // 3. Build strategy embedded into the IR. - mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} - -//===--------------------------------------------------------------------===// -// Convolution strategies. -//===--------------------------------------------------------------------===// -/// Placeholder to encode fixed convolutions that should take finer-grained -/// precedence over other heuristics. In the future, this could be lifted to -/// e.g. `gpuModel` or higher up in some transform dialect database summary of -/// "known good things". -static FailureOr applyKnownGoodConvolutionConfigurations( - const transform_ext::MatchedConvolutionCaptures &captures, - const GPUModel &gpuModel) { - return failure(); -} - -static void failSafeOverrides(ImplicitGemmStrategy &strategy, - const GPUModel &gpuModel) { - // Prefer a default block tile of 1 for the batch. - strategy.blockTileSizes = SmallVector{1, 128, 128}; - // Failsafe for blockTileM to avoid tiling by > size (i.e. no tiling). - int64_t blockTileM = selectLargestFailsafeValueIfNeeded( - strategy.blockTileM(), strategy.m(), {16, 32, 64, 128}, {1, 16, 32, 64}); - // Failsafe for blockTileN to avoid tiling by > size (i.e. no tiling). - int64_t blockTileN = selectLargestFailsafeValueIfNeeded( - strategy.blockTileN(), strategy.n(), {16, 32, 64, 128}, {1, 16, 32, 64}); - // Failsafe for reductionSize to avoid tiling by > size (i.e. no tiling). 
- int64_t reductionTileSize = selectLargestFailsafeValueIfNeeded( - strategy.reductionTileSize, strategy.k(), {8, 16, 24, 32, 40, 48, 56, 64}, - {1, 8, 16, 24, 32, 40, 48, 56}); - // Failsafe for blockTileBatch to avoid tiling by > size (i.e. no tiling). - int64_t blockTileBatch = selectLargestFailsafeValueIfNeeded( - /*value=*/strategy.blockTileBatch(), - /*limit=*/strategy.batch(), - /*thresholds=*/{2, 4, 8, 16, 32, 64, 128}, - /*failSafeValues=*/{1, 2, 4, 8, 16, 32, 64}); - strategy.blockTileSizes = {blockTileBatch, blockTileM, blockTileN}; - strategy.reductionTileSize = reductionTileSize; - // Avoid too deep pipelines. This should also look at shared memory usage in - // the future. - if (strategy.pipelineDepth * strategy.reductionTileSize > strategy.k()) { - strategy.pipelineDepth = - llvm::divideFloorSigned(strategy.k(), strategy.reductionTileSize); - } -} - -/// The configurations below have been determined empirically. -// TODO: Significantly improve these heuristics. -static ImplicitGemmStrategy -getConvolutionConfig(MLIRContext *context, - const transform_ext::MatchedConvolutionCaptures &captures, - const GPUModel &gpuModel) { - ImplicitGemmStrategy strategy(context, captures, gpuModel); - if (strategy.cliOptionsSpecified) - return strategy; - - auto maybeHardcodedConfiguration = - applyKnownGoodConvolutionConfigurations(captures, gpuModel); - if (succeeded(maybeHardcodedConfiguration)) - return *maybeHardcodedConfiguration; - - // TODO: encode a decision tree of reasonnable heuristics here. - - // Apply failsafe overrides to avoid identified bad corner cases. - failSafeOverrides(strategy, gpuModel); - - return strategy; -} - -static LogicalResult -matchAndSetConvolutionStrategy(mlir::FunctionOpInterface entryPoint, - linalg::LinalgOp op, const GPUModel &gpuModel) { - if (!clGPUEnableTransformDialectImplicitGemmStrategy) { - LDBG("--Implicit gemm strategy flag turned off\n"); - return failure(); - } - - // 1. Match a reduction and surrounding ops. - StructuredOpMatcher *fill; - StructuredOpMatcher *convolution; - StructuredOpMatcher *trailing; - transform_ext::MatchedConvolutionCaptures captures; - transform_ext::MatcherContext matcherContext; - makeConvolutionMatcher(matcherContext, convolution, fill, trailing, captures, - /*mustMatchEntireFunc=*/true); - if (!matchPattern(op, *convolution)) { - LDBG("--Implicit gemm strategy fail to match\n"); - return failure(); - } - - // We are very peculiar about the dispatches we want to match for now: - // - f32 or f16 only atm. - // - Mandatory fill op. - // - Require minimum tile alignment due to img2col. - // - Otherwise, we take it. - if (!fill->getCaptured() || trailing->getCaptured()) { - LDBG("--Implicit gemm strategy fill / trailing preconditions failed\n"); - return failure(); - } - - // Currently requires a typical 2d named convolution (conv_2d_nchw/nhwc). 
- if (captures.convolutionDims.outputChannel.size() != 1) { - return failure(); - } - if (captures.convolutionDims.inputChannel.size() != 1) { - return failure(); - } - if (captures.convolutionDims.outputImage.size() != 2) { - return failure(); - } - if (captures.convolutionDims.filterLoop.size() != 2) { - return failure(); - } - if (captures.convolutionDims.batch.size() != 1) { - return failure(); - } - - int64_t channelSize = 1; - for (auto dim : captures.convolutionDims.outputChannel) - channelSize *= captures.convolutionOpSizes[dim]; - int64_t imageSize = 1; - for (auto dim : captures.convolutionDims.outputImage) - imageSize *= captures.convolutionOpSizes[dim]; - - int64_t derivedK = 1; - for (auto dim : captures.convolutionDims.filterLoop) - derivedK *= captures.convolutionOpSizes[dim]; - for (auto dim : captures.convolutionDims.inputChannel) - derivedK *= captures.convolutionOpSizes[dim]; - - // Require tile-aligned due to the img2col op. - if (channelSize % 64 || imageSize % 64 || derivedK % 16) { - LDBG("--Implicit gemm strategy alignment check failed\n"); - return failure(); - } - - iree_compiler::gpu::ImplicitGemmStrategy strategy = - getConvolutionConfig(op->getContext(), captures, gpuModel); - - // Validate the strategy configuration against the compilation target. - if (failed(strategy.validate(gpuModel))) { - LDBG("--Implicit gemm strategy failed to validate\n"); - return failure(); - } - - // 2. Construct the configuration and the strategy builder. - // TODO: Generalize along the HW axis. - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - return buildConvolutionImplicitGemmStrategy(b, variant, strategy); - }; - - // 3. Build strategy embedded into the IR. - mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} - -//===--------------------------------------------------------------------===// -// Pad strategies. -//===--------------------------------------------------------------------===// - -/// Placeholder to encode fixed pads that should take finer-grained precedence -/// over other heuristics. In the future, this could be lifted to -/// e.g. `gpuModel` or higher up in some transform dialect database summary of -/// "known good things". -static FailureOr applyKnownGoodPadConfigurations( - const transform_ext::MatchedPadCaptures &captures, - const GPUModel &gpuModel) { - if (ArrayRef{captures.dims} == ArrayRef{1024, 1024}) { - return PadConfig{}; - } - return failure(); -} - -/// Placeholder to encode simple heuristics. -static PadConfig getPadConfig(const transform_ext::MatchedPadCaptures &captures, - const GPUModel &gpuModel) { - auto maybeHardcodedConfiguration = - applyKnownGoodPadConfigurations(captures, gpuModel); - if (succeeded(maybeHardcodedConfiguration)) - return *maybeHardcodedConfiguration; - return PadConfig{}; -} - -static LogicalResult -matchAndSetPadStrategy(mlir::FunctionOpInterface entryPoint, tensor::PadOp op, - const GPUModel &gpuModel) { - if (!clGPUEnableTransformDialectPadStrategy) { - LDBG("--Pad strategy flag turned off\n"); - return failure(); - } - - // 1. Match a padOp. 
- CapturingOpMatcher *pad; - MatchedPadCaptures captures; - MatcherContext matcherContext; - makePadMatcher(matcherContext, pad, captures, /*mustMatchEntireFunc=*/true); - - if (!matchPattern(op.getOperation(), *pad)) { - LDBG("--Pad strategy failed to match\n"); - return failure(); - } - if (captures.rank != 2) { - LDBG("--Pad strategy supported ranks check failed\n"); - return failure(); - } - if (!captures.elementType.isF32()) { - LDBG("--Pad strategy elemental type check failed\n"); - return failure(); - } - - // 2. Construct the strategy builder. - PadConfig padConfig = getPadConfig(captures, gpuModel); - iree_compiler::gpu::PadStrategy strategy(op->getContext(), captures, - padConfig, gpuModel); - if (strategy.useAsyncCopies) { - LDBG("--Async copies not supported yet\n"); - return failure(); - } - if (strategy.numThreads.size() > 3) { - LDBG("--Can only assign 3 num threads\n"); - return failure(); - } - // Make sure all thread numbers are set. - if (strategy.numThreads.size() != 3) { - strategy.numThreads.resize(3, 1); - } - - auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { - return buildPadStrategy(b, variant, strategy); - }; - - // 3. Build strategy embedded into the IR. - mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); - - return success(); -} - -//===--------------------------------------------------------------------===// -// Switch between strategies depending on matched IR. -//===--------------------------------------------------------------------===// -LogicalResult mlir::iree_compiler::gpu::matchAndSetTransformStrategy( - mlir::FunctionOpInterface entryPoint, Operation *op, - const GPUModel &gpuModel) { - LDBG("Look up a TD strategy for entryPoint:\n" << entryPoint << "\n"); - auto padOp = dyn_cast(op); - if (padOp) { - if (succeeded(matchAndSetPadStrategy(entryPoint, padOp, gpuModel))) { - LDBG("Activate pad strategy\n"); - return success(); - } - LDBG("Unmatched pad strategy\n"); - return failure(); - } - auto linalgOp = dyn_cast(op); - if (!linalgOp) { - LDBG("Not a Linalg op: " << *op << " -> Fail\n"); - return failure(); - } - if (succeeded(matchAndSetReductionStrategy(entryPoint, linalgOp, gpuModel))) { - LDBG("Activate reduction strategy\n"); - return success(); - } - if (succeeded(matchAndSetMatmulStrategy(entryPoint, linalgOp, gpuModel))) { - LDBG("Activate matmul\n"); - return success(); - } - if (succeeded( - matchAndSetBatchMatmulStrategy(entryPoint, linalgOp, gpuModel))) { - LDBG("Activate batch matmul\n"); - return success(); - } - if (succeeded( - matchAndSetConvolutionStrategy(entryPoint, linalgOp, gpuModel))) { - LDBG("Activate convolution\n"); - return success(); - } - // TODO: Add more transform dialect strategy for other kind of dispatch - // regions. - LDBG("No suitable strategy found\n"); - return failure(); -} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h deleted file mode 100644 index d8093faa9840..000000000000 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2023 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STRATEGIES_H_ -#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STRATEGIES_H_ - -#include "llvm/ADT/StringRef.h" -#include "mlir/Interfaces/FunctionInterfaces.h" - -namespace mlir { -class ImplicitLocOpBuilder; -class Value; -} // namespace mlir - -namespace mlir::iree_compiler::gpu { - -/// Forward declarations of all supported strategies. -class BatchMatmulStrategy; -class MatmulStrategy; -class PadStrategy; -class SmallReductionStrategy; -class StagedReductionStrategy; - -static constexpr int64_t kCudaWarpSize = 32; -static constexpr int64_t kCudaMaxNumThreads = 1024; - -/// Struct for representing supported WMMA/Cooperative Matrix configurations. -/// This is a reflection of SPIRV_CooperativeMatrixPropertiesNVAttr. -struct MMAConfig { - int64_t m; - int64_t n; - int64_t k; - Type aType; - Type bType; - Type cType; -}; - -/// Placeholder for some hardware model proxy that contains relevant information -/// to configure the strategies. In the future, this will need to be -/// driven by some contract with the runtime. -struct GPUModel { - static constexpr llvm::StringLiteral kDefaultGPU = "DefaultGPU"; - llvm::StringRef model = kDefaultGPU; - /// TODO: Support a range of subgroup sizes. - int64_t subgroupSize = kCudaWarpSize; - std::optional<int64_t> minSubgroupSize = std::nullopt; - std::optional<int64_t> maxSubgroupSize = std::nullopt; - int64_t maxWorkGroupInvocations = kCudaMaxNumThreads; - int64_t maxWorkGroupSize[3] = {1024, 1024, 64}; - bool hasWarpShuffle = false; - bool hasTF32TensorCore = false; - bool hasMmaSync = false; - SmallVector<MMAConfig> supportedWMMAConfigs = {}; -}; - -//===--------------------------------------------------------------------===// -// GPU strategy base. -//===--------------------------------------------------------------------===// -/// Basic structure to hold target specific information needed for all gpu -/// strategies. Certain quantities that can be dynamically selected, such as -/// subgroup size, will need to be configured with some contract with the -/// runtime. -struct GPUStrategy { - /// TODO: Configure subgroup size with the strategy and return the selected - /// size to the target (i.e. LLVMGPU or SPIR-V). - GPUStrategy(const GPUModel &gpuModel) : subgroupSize(gpuModel.subgroupSize) {} - /// TODO: Add other quantities relevant to strategy builders. - int64_t subgroupSize; -}; - -//===--------------------------------------------------------------------===// -// Matmul strategies. -//===--------------------------------------------------------------------===// -/// Entry point to build the transform IR corresponding to a tensorcore-based -/// strategy for linalg.fill + linalg.matmul on f32. -/// Does not support leading or trailing operations atm. -void buildMatmulTensorCoreStrategy(ImplicitLocOpBuilder &b, Value variantH, - const MatmulStrategy &strategy); - -//===--------------------------------------------------------------------===// -// Batch matmul strategies. -//===--------------------------------------------------------------------===// -/// Entry point to build the transform IR corresponding to an FMA-based strategy -/// for linalg.fill + linalg.batch_matmul. -void buildBatchMatmulStrategy(ImplicitLocOpBuilder &b, Value variantH, - const BatchMatmulStrategy &strategy); - -//===--------------------------------------------------------------------===// -// Pad strategies.
-//===--------------------------------------------------------------------===// -/// Entry point to build the transform IR corresponding to a simple pad -/// strategy. -/// Does not support leading or trailing operations atm. -void buildPadStrategy(ImplicitLocOpBuilder &b, Value variantH, - const PadStrategy &strategy); - -//===--------------------------------------------------------------------===// -// Reduction strategies. -//===--------------------------------------------------------------------===// -/// Structure to hold a summary of HW-derived properties to configure the -/// reduction strategy. -/// The objective of this struct is to act as a minimal summary of key -/// properties derived from the hardware (e.g. by an oracle) and that are -/// sufficient to steer the strategy to produce a good version. -/// These can be thought of as latent variables or embeddings that directly -/// control the strategy and can be derived from the hardware by some procedure. -enum class ReductionStrategy { Small, Staged }; -struct ReductionConfig { - int64_t maxNumThreads; - int64_t vectorSize; - ReductionStrategy strategy; -}; - -/// Entry point to build the transform IR corresponding to a small reduction -/// strategy. -/// This is used for mapping an N-D parallel, 1-D reduction operation with a -/// small reduction on which the default staged reduction strategy is otherwise -/// inefficient. -/// The 1-D reduction dimension must be in the most minor dimension. -/// Supports an optional leading and an optional trailing elementwise operation. -void buildSmallReductionStrategy(ImplicitLocOpBuilder &b, Value variantH, - const SmallReductionStrategy &strategy); - -/// Entry point to build the transform IR corresponding to a staged reduction -/// strategy. -/// This is used for mapping an N-D parallel, 1-D reduction operation. -/// The 1-D reduction dimension must be in the most minor dimension. -/// Supports an optional leading and an optional trailing elementwise operation. -void buildStagedReductionStrategy(ImplicitLocOpBuilder &b, Value variantH, - const StagedReductionStrategy &strategy); - -//===----------------------------------------------------------------------===// -// Higher-level strategy creation APIs; these should favor -// user-friendliness. -//===----------------------------------------------------------------------===// - -/// Try to find an existing transform dialect strategy for a given entry point.
-LogicalResult matchAndSetTransformStrategy(mlir::FunctionOpInterface entryPoint, - Operation *op, - const GPUModel &gpuModel); - -} // namespace mlir::iree_compiler::gpu - -#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_STRATEGIES_H_ diff --git a/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp index 7ab5d266dbdb..c4d5d62349df 100644 --- a/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp @@ -8,14 +8,6 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/VMVX/KernelDispatch.h" #include "iree/compiler/Codegen/VMVX/Passes.h" -#include "iree/compiler/Dialect/HAL/IR/HALDialect.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" -#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" @@ -34,18 +26,7 @@ class VMVXSelectLoweringStrategyPass VMVXSelectLoweringStrategyPass> { public: void getDependentDialects(DialectRegistry ®istry) const override { - // TODO(qedawkins): Once TransformStrategies is deprecated, drop the - // unnecessary dialect registrations. - // clang-format off - registry.insert(); - // clang-format on + registry.insert(); } void runOnOperation() override;
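For readers skimming this removal: the deleted matchAndSetTransformStrategy above performed a first-match-wins cascade over per-op strategy matchers (pad, then reduction, then matmul, then batch matmul, then convolution). The following standalone C++ sketch illustrates only that dispatch pattern; all type and function names in it are placeholders invented for the illustration, not IREE APIs, and the sketch is not part of the patch itself.

// Standalone illustration (placeholder names, not IREE APIs): the removed
// matchAndSetTransformStrategy tried one strategy matcher after another and
// stopped at the first that claimed the op; otherwise it reported failure so
// the caller could fall back to the default lowering heuristics.
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

namespace sketch {

// Stand-in for the matched operation (tensor.pad, reduction, matmul, ...).
struct Op {
  std::string kind;
};

// Stand-in for the hardware proxy that the removed GPUModel struct provided.
struct GpuModel {
  bool hasWarpShuffle = true;
};

// A matcher either claims the op (returning a strategy name) or declines.
using Matcher =
    std::function<std::optional<std::string>(const Op &, const GpuModel &)>;

// First-match-wins cascade; the matcher order encodes strategy precedence.
std::optional<std::string> dispatch(const Op &op, const GpuModel &gpu,
                                    const std::vector<Matcher> &matchers) {
  for (const Matcher &match : matchers)
    if (std::optional<std::string> strategy = match(op, gpu))
      return strategy;
  return std::nullopt; // No suitable strategy found.
}

} // namespace sketch

int main() {
  using namespace sketch;
  const std::vector<Matcher> matchers = {
      [](const Op &op, const GpuModel &) -> std::optional<std::string> {
        if (op.kind == "pad")
          return "pad strategy";
        return std::nullopt;
      },
      [](const Op &op, const GpuModel &gpu) -> std::optional<std::string> {
        // The removed reduction path also required warp-shuffle support.
        if (op.kind == "reduction" && gpu.hasWarpShuffle)
          return "reduction strategy";
        return std::nullopt;
      },
      [](const Op &op, const GpuModel &) -> std::optional<std::string> {
        if (op.kind == "matmul")
          return "matmul tensor core strategy";
        return std::nullopt;
      },
  };

  GpuModel gpu;
  const std::vector<std::string> kinds = {"pad", "reduction", "matmul", "conv"};
  for (const std::string &kind : kinds) {
    std::optional<std::string> strategy = dispatch(Op{kind}, gpu, matchers);
    std::cout << kind << " -> "
              << (strategy ? *strategy : std::string("no strategy (fallback)"))
              << "\n";
  }
  return 0;
}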