Skip to content

Commit

Permalink
[Transform] Add transform.iree.pack_shared_memory_alloc (#14503)
Browse files Browse the repository at this point in the history
This patch adds the `transform.iree.pack_shared_memory_alloc` op. This
op simply takes in a funcOp and applies the LLVMGPUPackSharedMemoryAlloc
pass to it.
  • Loading branch information
Groverkss authored Aug 2, 2023
1 parent b9c0623 commit fb8263f
Show file tree
Hide file tree
Showing 11 changed files with 167 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,54 +9,11 @@
#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/IR/Dominance.h"

namespace mlir {
namespace iree_compiler {

/// Insert barriers and wait operations if there are allocs of a different alias
/// group before the given alloc.
/// Insert barriers and wait operations if there are allocs of a different alias
/// group before the given alloc.
static void addBarrier(func::FuncOp funcOp, Operation *alloc,
                       ArrayRef<Operation *> aliasGroup) {
  Block *entryBlock = &(*funcOp.getBlocks().begin());
  bool needBarrier = false;
  // An alloc outside the entry block may execute after arbitrary other code,
  // so conservatively require a barrier.
  if (alloc->getBlock() != entryBlock) {
    needBarrier = true;
  } else {
    // Scan the entry block up to `alloc`: any op carrying regions (control
    // flow whose contents we do not analyze) or any alloc belonging to a
    // different alias group forces a barrier.
    for (Operation &op : entryBlock->getOperations()) {
      if (&op == alloc)
        break;
      if (op.getNumRegions() != 0) {
        needBarrier = true;
        break;
      }
      if (isa<memref::AllocOp>(&op) && !llvm::is_contained(aliasGroup, &op)) {
        needBarrier = true;
        break;
      }
    }
  }
  if (!needBarrier)
    return;
  // Builder is positioned immediately before `alloc`, so the synchronization
  // ops are emitted ahead of the allocation they protect.
  OpBuilder builder(alloc);
  // TODO: make it an option if needed.
  bool hasAsyncCopies = true;
  if (hasAsyncCopies) {
    // Create an (empty) async group and wait on it with numGroups = 0 so all
    // in-flight nvgpu async copies complete before the barrier.
    Value groupToken = builder.create<nvgpu::DeviceAsyncCreateGroupOp>(
        funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()),
        SmallVector<Value>());
    builder.create<nvgpu::DeviceAsyncWaitOp>(funcOp.getLoc(), groupToken,
                                             builder.getI32IntegerAttr(0));
  }
  builder.create<gpu::BarrierOp>(alloc->getLoc());
}

namespace {

struct LLVMGPUPackSharedMemoryAllocPass
Expand All @@ -67,35 +24,7 @@ struct LLVMGPUPackSharedMemoryAllocPass
registry.insert<nvgpu::NVGPUDialect>();
}

void runOnOperation() override {
  func::FuncOp funcOp = getOperation();
  DominanceInfo dominators(funcOp);
  // Collect every alloc placed in the shared (workgroup) address space.
  SmallVector<Operation *> allocs;
  funcOp.walk([&](memref::AllocOp alloc) {
    if (hasSharedMemoryAddressSpace(alloc.getType())) {
      allocs.push_back(alloc);
    }
  });
  // First sink the alloc as low as possible in the CFG.
  sinkOpsInCFG(allocs, dominators);
  // Partition the allocs into alias groups that can share the same storage.
  SmallVector<AliasGroup> aliasGroups;
  analyseAllocsForPacking(funcOp, allocs, aliasGroups);
  // If there is 1 or less alias group there is nothing to do.
  if (aliasGroups.size() <= 1)
    return;

  // Pack all the allocations into one i8 alloc.
  // We may need to add extra barriers to make sure we are done writing or
  // reading from the previous alias group before starting a new one.
  for (size_t i = 0; i < aliasGroups.size(); i++) {
    for (Operation *alloc : aliasGroups[i]) {
      addBarrier(funcOp, alloc, aliasGroups[i]);
    }
  }

  OpBuilder builder(funcOp.getContext());
  packAllocs(builder, funcOp, aliasGroups);
}
void runOnOperation() override { packSharedMemoryAlloc(getOperation()); }
};
} // namespace

Expand Down
4 changes: 4 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
namespace mlir {
namespace iree_compiler {

//===----------------------------------------------------------------------===//
// Passes
//===----------------------------------------------------------------------===//

/// Lowering using SIMT CUDA core operations.
void addGPUMatmulSimtPassPipeline(OpPassManager &pm);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ using llvm::dbgs;
#define DBGS_VECTOR_TO_MMA() (dbgs() << '[' << DEBUG_VECTOR_TO_MMA << "] ")

using namespace mlir;
using namespace mlir::iree_compiler;
using namespace mlir::iree_compiler::IREE;

iree_compiler::IREE::transform_dialect::LLVMGPUExtensions::LLVMGPUExtensions() {
Expand Down Expand Up @@ -1478,5 +1479,20 @@ transform_dialect::EliminateGpuBarriersOp::applyToOne(
return DiagnosedSilenceableFailure::success();
}

/// Transform-dialect entry point for shared-memory packing: forwards the
/// payload function to the packSharedMemoryAlloc() utility.
///
/// Always reports success; the underlying utility is best-effort and returns
/// early when there is nothing to pack.
DiagnosedSilenceableFailure
transform_dialect::PackSharedMemoryAllocOp::applyToOne(
    transform::TransformRewriter &rewriter, func::FuncOp funcOp,
    transform::ApplyToEachResultList &results,
    transform::TransformState &state) {
  packSharedMemoryAlloc(funcOp);
  return DiagnosedSilenceableFailure::success();
}

void transform_dialect::PackSharedMemoryAllocOp::getEffects(
SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
transform::onlyReadsHandle(getTarget(), effects);
transform::modifiesPayload(effects);
}

#define GET_OP_CLASSES
#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.cpp.inc"
Original file line number Diff line number Diff line change
Expand Up @@ -648,4 +648,38 @@ def EliminateGpuBarriersOp :
}];
}

def PackSharedMemoryAllocOp : Op<Transform_Dialect, "iree.pack_shared_memory_alloc",
    [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
     TransformEachOpTrait,
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let summary = "Pack shared memory allocation to reduce memory usage";
  let description = [{
    Looks for allocs in shared memory space with overlapping liveness and
    groups them, then packs all the allocations in each group into one i8
    alloc. Also adds barriers to make sure we are done writing/reading
    from the previous alias group before starting a new one.

    #### Return modes

    It does not consume the target handle and always returns success.
  }];

  let arguments = (
    ins TransformHandleTypeInterface:$target
  );
  let results = (outs);

  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";
  let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::func::FuncOp funcOp,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

#endif // IREE_COMPILER_CODEGEN_LLVMGPU_TRANSFORMEXTENSIONS_LLVMGPUEXTENSIONS
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ iree_compiler_cc_library(
"LLVMGPUUtils.h",
],
deps = [
"//compiler/src/iree/compiler/Codegen/Transforms",
"//compiler/src/iree/compiler/Codegen/Utils",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:AffineDialect",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ iree_cc_library(
MLIRMemRefDialect
MLIRNVGPUDialect
MLIRVectorDialect
iree::compiler::Codegen::Transforms
iree::compiler::Codegen::Utils
PUBLIC
)
Expand Down
67 changes: 67 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"

#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
Expand Down Expand Up @@ -348,5 +349,71 @@ void reorderTranspose(RewriterBase &rewriter, func::FuncOp funcOp) {
}
}

/// Insert barriers and wait operations if there are allocs of a different alias
/// group before the given alloc.
static void addBarrier(func::FuncOp funcOp, Operation *alloc,
                       ArrayRef<Operation *> aliasGroup) {
  Block *entry = &funcOp.getBlocks().front();
  // An alloc outside the entry block may run after arbitrary code: always
  // synchronize. Otherwise scan the entry block up to the alloc.
  bool requiresBarrier = alloc->getBlock() != entry;
  if (!requiresBarrier) {
    for (Operation &candidate : entry->getOperations()) {
      if (&candidate == alloc)
        break;
      // Ops with regions hide control flow we do not analyze; allocs from
      // another alias group may still be live. Either case needs a barrier.
      bool hasRegions = candidate.getNumRegions() != 0;
      bool foreignAlloc = isa<memref::AllocOp>(&candidate) &&
                          !llvm::is_contained(aliasGroup, &candidate);
      if (hasRegions || foreignAlloc) {
        requiresBarrier = true;
        break;
      }
    }
  }
  if (!requiresBarrier)
    return;
  // Emit the synchronization immediately before the alloc it protects.
  OpBuilder builder(alloc);
  // TODO: make it an option if needed.
  bool hasAsyncCopies = true;
  if (hasAsyncCopies) {
    // Create an empty async group and wait with numGroups = 0 so all pending
    // nvgpu async copies finish before the barrier.
    Value token = builder.create<nvgpu::DeviceAsyncCreateGroupOp>(
        funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()),
        SmallVector<Value>());
    builder.create<nvgpu::DeviceAsyncWaitOp>(funcOp.getLoc(), token,
                                             builder.getI32IntegerAttr(0));
  }
  builder.create<gpu::BarrierOp>(alloc->getLoc());
}

/// Pack shared-memory allocations with non-overlapping liveness into a single
/// i8 buffer, inserting barriers between alias groups as needed.
void packSharedMemoryAlloc(func::FuncOp funcOp) {
  DominanceInfo dominators(funcOp);

  // Collect every alloc that lives in the shared (workgroup) address space.
  SmallVector<Operation *> sharedMemAllocs;
  funcOp.walk([&](memref::AllocOp allocOp) {
    if (hasSharedMemoryAddressSpace(allocOp.getType()))
      sharedMemAllocs.push_back(allocOp);
  });

  // First sink the allocs as low as possible in the CFG.
  sinkOpsInCFG(sharedMemAllocs, dominators);

  // Partition the allocs into alias groups that can share storage.
  SmallVector<AliasGroup> aliasGroups;
  analyseAllocsForPacking(funcOp, sharedMemAllocs, aliasGroups);
  // If there is 1 or less alias group there is nothing to do.
  if (aliasGroups.size() <= 1)
    return;

  // Pack all the allocations into one i8 alloc.
  // We may need to add extra barriers to make sure we are done writing or
  // reading from the previous alias group before starting a new one.
  for (const AliasGroup &group : aliasGroups)
    for (Operation *allocOp : group)
      addBarrier(funcOp, allocOp, group);

  OpBuilder builder(funcOp.getContext());
  packAllocs(builder, funcOp, aliasGroups);
}

} // namespace iree_compiler
} // namespace mlir
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ void doLayoutAnalysisAndDistribution(RewriterBase &rewriter,
/// Function to reorder transposes and elementwise ops.
void reorderTranspose(RewriterBase &rewriter, func::FuncOp funcOp);

/// Look for allocs in shared memory space with overlapping liveness,
/// group them, and then pack all the allocations in each group into one i8
/// alloc.
///
/// Also adds barriers to make sure we are done writing/reading
/// from the previous alias group before starting a new one.
void packSharedMemoryAlloc(func::FuncOp funcOp);

} // namespace iree_compiler
} // namespace mlir

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ iree_lit_test_suite(
"transform_dialect_vector_distribution.mlir",
"transform_dialect_bufferize.mlir",
"transform_dialect_eliminate_gpu_barriers.mlir",
"transform_dialect_pack_shared_memory_alloc.mlir",
"transform_dialect_promote_operands.mlir",
"transform_distribute_forall.mlir",
"transform_gpu_pipelining.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ iree_lit_test_suite(
"transform_dialect_bufferize.mlir"
"transform_dialect_eliminate_gpu_barriers.mlir"
"transform_dialect_hoist_allocs.mlir"
"transform_dialect_pack_shared_memory_alloc.mlir"
"transform_dialect_promote_operands.mlir"
"transform_dialect_vector_distribution.mlir"
"transform_distribute_forall.mlir"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// RUN: iree-opt %s --iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s

// CHECK-LABEL: shared_memory_disjoint
// CHECK-NOT: gpu.barrier
// CHECK-DAG: %[[PACKED:.+]] = memref.alloc() : memref<1024xi8, #gpu.address_space<workgroup>>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK: memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
// CHECK: %[[C512:.+]] = arith.constant 512 : index
// CHECK: memref.view %[[PACKED]][%[[C512]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
// CHECK: nvgpu.device_async_create_group
// CHECK: nvgpu.device_async_wait %0 {numGroups = 0 : i32}
// CHECK: gpu.barrier
// CHECK: memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<32xf32, #gpu.address_space<workgroup>>
// Three workgroup-space allocs: %0 and %1 are both 128xf32, %2 is 32xf32.
// The stores interleave so %0/%1 have overlapping liveness while %2's use
// comes last, allowing the pass to reuse the packed buffer for %2.
func.func @shared_memory_disjoint() {
  %c0 = arith.constant 0 : index
  %cst_f32 = arith.constant 0.000000e+00 : f32
  %cst_i8 = arith.constant 0 : i8
  %0 = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
  %1 = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
  %2 = memref.alloc() : memref<32xf32, #gpu.address_space<workgroup>>
  memref.store %cst_f32, %0[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
  memref.store %cst_f32, %1[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
  memref.store %cst_f32, %0[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
  memref.store %cst_f32, %2[%c0] : memref<32xf32, #gpu.address_space<workgroup>>
  return
}

// Match every func.func in the payload, pack its shared-memory allocs, then
// apply CSE to clean up duplicated index constants created by the packing.
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
  transform.iree.pack_shared_memory_alloc %0 : (!transform.any_op) -> ()
  transform.iree.apply_cse %0 : !transform.any_op
}

0 comments on commit fb8263f

Please sign in to comment.