Skip to content

Commit

Permalink
[LLVMGPU][ROCm] Move kernel annotation before serialization (#18573)
Browse files Browse the repository at this point in the history
ROCMTarget serialization is not the best location for this code because
it violates the following invariant in
`buildLLVMGPUCodegenPassPipeline`:
>  - The module contains the final llvm.module ready to be serialized.

Specifically, requiring the ROCDL dialect to be loaded is problematic
during serialization, as target-agnostic serialization
(`iree-hal-serialize-executables`) does not register dependent dialects.

This PR moves kernel annotation just after conversion to ROCDL in the
`LowerToGPUPasses` pass pipeline.

Also add kernel annotation tests that were not straightforward to add
before, as annotation was not a freestanding pass whose output could be
inspected.
  • Loading branch information
kuhar authored Sep 23, 2024
1 parent 5a6bd8d commit eef4623
Show file tree
Hide file tree
Showing 11 changed files with 274 additions and 91 deletions.
96 changes: 7 additions & 89 deletions compiler/plugins/target/ROCM/ROCMTarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h"
#include "iree/compiler/PluginAPI/Client.h"
#include "iree/compiler/Utils/FlatbufferUtils.h"
#include "iree/compiler/Utils/ModuleUtils.h"
#include "iree/compiler/Utils/ToolUtils.h"
#include "iree/schemas/hip_executable_def_builder.h"
#include "llvm/ADT/StringExtras.h"
Expand All @@ -37,17 +36,12 @@
#include "llvm/Passes/StandardInstrumentations.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/DialectResourceBlobManager.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h"
Expand Down Expand Up @@ -136,68 +130,6 @@ struct ROCmOptions {
}
};

// Parses the amdgpu chipset out of the architecture string carried by the GPU
// target attached to `targetAttr`. Yields failure when no GPU target attribute
// is present; otherwise forwards whatever `amdgpu::Chipset::parse` returns.
static FailureOr<amdgpu::Chipset>
getChipsetVersion(ExecutableTargetAttr targetAttr) {
  if (IREE::GPU::TargetAttr gpuTarget = getGPUTargetAttr(targetAttr))
    return amdgpu::Chipset::parse(gpuTarget.getArch());
  return failure();
}

// Set attributes on `funcOp` in order to use upstream's translation of
// ROCDL dialect attributes to LLVM. Primarily this is `rocdl.kernel`
// (sets the calling convention and workgroup size uniformity) but this will
// also set both forms of workgroup size metadata from `exportOp` (if it is set)
// and will set the waves_per_eu flag where relevant. Finally, it will mark
// kernel arguments `inreg` to enable argument preloading on supported
// architectures.
static void annotateKernelForTranslation(LLVM::LLVMFuncOp funcOp,
                                         ExecutableExportOp exportOp,
                                         ExecutableTargetAttr targetAttr,
                                         OpBuilder &builder) {
  auto *rocdlDialect =
      funcOp.getContext()->getLoadedDialect<ROCDL::ROCDLDialect>();
  UnitAttr unitAttr = builder.getUnitAttr();
  rocdlDialect->getKernelAttrHelper().setAttr(funcOp, unitAttr);
  std::optional<ArrayAttr> workgroupSizeAttr = exportOp.getWorkgroupSize();
  if (workgroupSizeAttr && workgroupSizeAttr->size() <= 3) {
    // Dimensions the export op does not specify default to 1, so the array is
    // always fully initialized before it is turned into an attribute.
    std::array<int32_t, 3> wgSizes = {1, 1, 1};
    int32_t flatWgSize = 1;
    // `llvm::zip` stops at the shorter range. `llvm::zip_equal` would require
    // exactly three sizes, which contradicts the `<= 3` guard above.
    for (auto [value, attr] :
         llvm::zip(wgSizes, workgroupSizeAttr->getAsRange<IntegerAttr>())) {
      value = attr.getInt();
      flatWgSize *= value;
    }
    rocdlDialect->getReqdWorkGroupSizeAttrHelper().setAttr(
        funcOp, builder.getDenseI32ArrayAttr(wgSizes));
    // The flat workgroup size is emitted as a "min,max" pair with both bounds
    // equal to the product of the per-dimension sizes.
    rocdlDialect->getFlatWorkGroupSizeAttrHelper().setAttr(
        funcOp,
        builder.getStringAttr(Twine(flatWgSize) + "," + Twine(flatWgSize)));
  }

  if (std::optional<IntegerAttr> attr =
          getConfigIntegerAttr(targetAttr, "waves_per_eu")) {
    rocdlDialect->getWavesPerEuAttrHelper().setAttr(funcOp, *attr);
  }

  // Kernel argument preloading is only supported on gfx940 and newer targets
  // from the CDNA family. This is enabled using the `inreg` function argument
  // attribute.
  FailureOr<amdgpu::Chipset> chipset = getChipsetVersion(targetAttr);
  if (failed(chipset))
    return;
  if (chipset->majorVersion != 9 || *chipset < amdgpu::Chipset(9, 4, 0))
    return;

  auto inRegAttrName =
      builder.getStringAttr(LLVM::LLVMDialect::getInRegAttrName());
  for (unsigned i = 0, e = funcOp.getNumArguments(); i < e; ++i)
    funcOp.setArgAttr(i, inRegAttrName, unitAttr);
}

static void dumpModuleToPath(StringRef path, StringRef baseName,
StringRef suffix, StringRef extension,
llvm::Module &module) {
Expand Down Expand Up @@ -318,8 +250,6 @@ class ROCMTargetBackend final : public TargetBackend {
registry.insert<IREE::Codegen::IREECodegenDialect>();
registry.insert<IREE::VectorExt::IREEVectorExtDialect>();
registry.insert<IREE::GPU::IREEGPUDialect>();
registry.insert<amdgpu::AMDGPUDialect>();
registry.insert<ROCDL::ROCDLDialect>();
}

void
Expand Down Expand Up @@ -407,11 +337,8 @@ class ROCMTargetBackend final : public TargetBackend {
// Collect all the entry point names.
auto exportOps = llvm::to_vector_of<IREE::HAL::ExecutableExportOp>(
variantOp.getExportOps());
llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOpMap;
std::optional<uint32_t> subgroupSize;
for (IREE::HAL::ExecutableExportOp exportOp : exportOps) {
exportOpMap[exportOp.getSymName()] = exportOp;

// TODO: put this either on the variant or propagate as a function
// attribute instead - today this *must* be consistent across all exports
// and it shouldn't need to be.
Expand All @@ -436,7 +363,9 @@ class ROCMTargetBackend final : public TargetBackend {
if (!variantOp.getObjects().has_value()) {
return variantOp.emitOpError()
<< "no objects defined for external variant";
} else if (variantOp.getObjects()->getValue().size() != 1) {
}

if (variantOp.getObjects()->getValue().size() != 1) {
// For now we assume there will be exactly one object file.
// In the future we will want to perform a linking step here and ideally
// support _also_ linking in the codegen results.
Expand All @@ -457,17 +386,6 @@ class ROCMTargetBackend final : public TargetBackend {
// Perform the translation in a separate context to avoid any
// multi-threading issues.
llvm::LLVMContext context;

// Set up attributes so upstream's conversions work right.
for (auto func : innerModuleOp.getOps<LLVM::LLVMFuncOp>()) {
// Un-exported functions are library functions or otherwise
// not kernels, so don't need these annotations.
if (!exportOpMap.contains(func.getName()))
continue;
annotateKernelForTranslation(func, exportOpMap[func.getName()],
targetAttr, executableBuilder);
}

std::unique_ptr<llvm::Module> llvmModule =
mlir::translateModuleToLLVMIR(innerModuleOp, context, libraryName);
if (!llvmModule) {
Expand All @@ -486,10 +404,10 @@ class ROCMTargetBackend final : public TargetBackend {
for (NamedAttribute funcAttr : funcAttrs) {
auto value = dyn_cast<StringAttr>(funcAttr.getValue());
if (!value) {
return variantOp->emitError("llvm_func_attrs attribute must be "
"adictionary of strings. Attribute " +
llvm::Twine(funcAttr.getName()) +
" is not a StringAttr.");
return variantOp->emitError()
<< "llvm_func_attrs attribute must be a dictionary of "
"strings. Attribute "
<< funcAttr.getName() << " is not a StringAttr.";
}
llvmFunc->addFnAttr(funcAttr.getName(), value.getValue());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ builtin.module {
}
builtin.module {
llvm.func @external_func() attributes {sym_visibility = "private"}
llvm.func @test() {
llvm.func @test() attributes { rocdl.kernel } {
llvm.call @external_func() : () -> ()
llvm.return
}
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ iree_compiler_cc_library(
"LLVMGPUVectorLowering.cpp",
"LLVMGPUVectorToGPU.cpp",
"Passes.cpp",
"ROCDLAnnotateKernelForTranslation.cpp",
"ROCDLKernelConfig.cpp",
"ROCDLLowerExecutableTarget.cpp",
"ROCDLSelectLoweringStrategy.cpp",
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ iree_cc_library(
"LLVMGPUVectorLowering.cpp"
"LLVMGPUVectorToGPU.cpp"
"Passes.cpp"
"ROCDLAnnotateKernelForTranslation.cpp"
"ROCDLKernelConfig.cpp"
"ROCDLLowerExecutableTarget.cpp"
"ROCDLSelectLoweringStrategy.cpp"
Expand Down
3 changes: 3 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
Expand Down Expand Up @@ -1099,6 +1100,8 @@ static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager,
if (forROCDL) {
// convert to ROCDL.
modulePassManager.addPass(createConvertToROCDLPass());
modulePassManager.addNestedPass<LLVM::LLVMFuncOp>(
createROCDLAnnotateKernelForTranslationPass());
} else {
// convert to NVVM.
modulePassManager.addPass(createConvertToNVVMPass());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <cassert>
#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "llvm/Support/LogicalResult.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"

namespace mlir::iree_compiler {

#define GEN_PASS_DEF_ROCDLANNOTATEKERNELFORTRANSLATIONPASS
#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h.inc"

namespace {
// Extracts the amdgpu chipset version from the chip architecture in the
// executable target attribute.
//
// Returns failure when `targetAttr` carries no GPU target attribute, or when
// `amdgpu::Chipset::parse` rejects the architecture string. A missing GPU
// target is reported as a failure rather than asserted on, so that builds
// without assertions do not dereference a null attribute.
static FailureOr<amdgpu::Chipset>
getChipsetVersion(IREE::HAL::ExecutableTargetAttr targetAttr) {
  IREE::GPU::TargetAttr gpuTarget = getGPUTargetAttr(targetAttr);
  if (!gpuTarget)
    return failure();
  return amdgpu::Chipset::parse(gpuTarget.getArch());
}

// Set attributes on `funcOp` in order to use upstream's translation of
// ROCDL dialect attributes to LLVM. Primarily this is `rocdl.kernel`
// (sets the calling convention and workgroup size uniformity) but this will
// also set both forms of workgroup size metadata from `exportOp` (if it is set)
// and will set the waves_per_eu flag where relevant. Finally, it will mark
// kernel arguments `inreg` to enable argument preloading on supported
// architectures.
//
// The target attribute is read from `variantOp`. Returns failure, after
// emitting an error on `variantOp`, when the chipset cannot be determined.
static LogicalResult
annotateKernelForTranslation(LLVM::LLVMFuncOp funcOp,
                             IREE::HAL::ExecutableVariantOp variantOp,
                             IREE::HAL::ExecutableExportOp exportOp) {
  OpBuilder builder(funcOp);
  auto *rocdlDialect =
      funcOp.getContext()->getLoadedDialect<ROCDL::ROCDLDialect>();
  assert(rocdlDialect && "ROCDL dialect not loaded");
  UnitAttr unitAttr = builder.getUnitAttr();
  rocdlDialect->getKernelAttrHelper().setAttr(funcOp, unitAttr);
  std::optional<ArrayAttr> workgroupSizeAttr = exportOp.getWorkgroupSize();
  if (workgroupSizeAttr && workgroupSizeAttr->size() <= 3) {
    // Dimensions the export op does not specify default to 1, so the array is
    // always fully initialized before it is turned into an attribute.
    std::array<int32_t, 3> wgSizes = {1, 1, 1};
    int32_t flatWgSize = 1;
    // `llvm::zip` stops at the shorter range. `llvm::zip_equal` would require
    // exactly three sizes, which contradicts the `<= 3` guard above.
    for (auto [value, attr] :
         llvm::zip(wgSizes, workgroupSizeAttr->getAsRange<IntegerAttr>())) {
      value = attr.getInt();
      flatWgSize *= value;
    }
    rocdlDialect->getReqdWorkGroupSizeAttrHelper().setAttr(
        funcOp, builder.getDenseI32ArrayAttr(wgSizes));
    // The flat workgroup size is emitted as a "min,max" pair with both bounds
    // equal to the product of the per-dimension sizes.
    rocdlDialect->getFlatWorkGroupSizeAttrHelper().setAttr(
        funcOp,
        builder.getStringAttr(Twine(flatWgSize) + "," + Twine(flatWgSize)));
  }

  IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.getTarget();
  if (std::optional<IntegerAttr> attr =
          getConfigIntegerAttr(targetAttr, "waves_per_eu")) {
    rocdlDialect->getWavesPerEuAttrHelper().setAttr(funcOp, *attr);
  }

  // Kernel argument preloading is only supported on gfx940 and newer targets
  // from the CDNA family. This is enabled using the `inreg` function argument
  // attribute.
  FailureOr<amdgpu::Chipset> chipset = getChipsetVersion(targetAttr);
  if (failed(chipset))
    return variantOp.emitError() << "failed to parse amdgpu chipset";

  if (chipset->majorVersion != 9 || *chipset < amdgpu::Chipset(9, 4, 0))
    return success();

  auto inRegAttrName =
      builder.getStringAttr(LLVM::LLVMDialect::getInRegAttrName());
  for (unsigned i = 0, e = funcOp.getNumArguments(); i < e; ++i)
    funcOp.setArgAttr(i, inRegAttrName, unitAttr);

  return success();
}

/// Annotates an `llvm.func` that corresponds to an exported entry point of its
/// enclosing `hal.executable.variant` with the attributes set by
/// `annotateKernelForTranslation`. Functions with no matching
/// `hal.executable.export` op are left untouched.
struct ROCDLAnnotateKernelForTranslationPass final
    : impl::ROCDLAnnotateKernelForTranslationPassBase<
          ROCDLAnnotateKernelForTranslationPass> {
  void runOnOperation() override {
    LLVM::LLVMFuncOp kernelCandidate = getOperation();

    auto enclosingVariant =
        kernelCandidate->getParentOfType<IREE::HAL::ExecutableVariantOp>();
    if (!enclosingVariant) {
      kernelCandidate.emitError()
          << "cannot find parent hal.executable.variant op";
      return signalPassFailure();
    }

    // Look for the export op whose symbol name equals the function name.
    StringRef symbolName = kernelCandidate.getName();
    IREE::HAL::ExecutableExportOp matchingExport;
    for (IREE::HAL::ExecutableExportOp exportOp :
         enclosingVariant.getExportOps()) {
      if (exportOp.getSymName() != symbolName)
        continue;
      matchingExport = exportOp;
      break;
    }

    // A function without an export op is a library function or otherwise not
    // a kernel, so there is nothing to annotate.
    if (!matchingExport)
      return;

    if (failed(annotateKernelForTranslation(kernelCandidate, enclosingVariant,
                                            matchingExport)))
      return signalPassFailure();
  }
};
} // namespace
} // namespace mlir::iree_compiler
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#define IREE_COMPILER_CODEGEN_LLVMGPU_ROCDLPASSES_H_

#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Pass/Pass.h"

namespace mlir::iree_compiler {
Expand Down
7 changes: 6 additions & 1 deletion compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ include "mlir/Pass/PassBase.td"
// ROCDL Passes (keep alphabetical)
//===----------------------------------------------------------------------===//

// Pass anchored on `LLVM::LLVMFuncOp`, registered under the command-line name
// `iree-rocdl-annotate-kernel-for-translation`. It lists the ROCDL dialect as
// a dependent dialect.
def ROCDLAnnotateKernelForTranslationPass : Pass<
"iree-rocdl-annotate-kernel-for-translation", "LLVM::LLVMFuncOp"> {
let summary = "Set function attributes before translating to LLVM IR";
let dependentDialects = ["ROCDL::ROCDLDialect"];
}

def ROCDLLowerExecutableTargetPass : InterfacePass<
"iree-rocdl-lower-executable-target", "mlir::FunctionOpInterface"> {
let summary = "Lower an IREE hal.executable.variant op using a suitable "
Expand All @@ -25,5 +31,4 @@ def ROCDLSelectLoweringStrategyPass :
"hal.executable.variant op";
}


#endif // IREE_CODEGEN_LLVMGPU_ROCDLPASSES
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ iree_lit_test_suite(
name = "lit",
srcs = enforce_glob(
[
"annotate_kernel_for_translation.mlir",
"config_tile_and_fuse.mlir",
"config_vector_distribute.mlir",
"config_user_vector_distribute.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ iree_lit_test_suite(
NAME
lit
SRCS
"annotate_kernel_for_translation.mlir"
"config_tile_and_fuse.mlir"
"config_user_vector_distribute.mlir"
"config_vector_distribute.mlir"
Expand Down
Loading

0 comments on commit eef4623

Please sign in to comment.