diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp index 7b2e880dc2a2..79152a9b36dd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp @@ -9,54 +9,11 @@ #include "iree/compiler/Codegen/LLVMGPU/PassDetail.h" #include "iree/compiler/Codegen/LLVMGPU/Passes.h" #include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h" -#include "iree/compiler/Codegen/Transforms/Transforms.h" -#include "iree/compiler/Codegen/Utils/GPUUtils.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" -#include "mlir/IR/Dominance.h" namespace mlir { namespace iree_compiler { -/// Insert barriers and wait operations if there are allocs of a different alias -/// group before the given alloc. -static void addBarrier(func::FuncOp funcOp, Operation *alloc, - ArrayRef aliasGroup) { - Block *entryBlock = &(*funcOp.getBlocks().begin()); - bool needBarrier = false; - if (alloc->getBlock() != entryBlock) { - needBarrier = true; - } else { - for (Operation &op : entryBlock->getOperations()) { - if (&op == alloc) - break; - if (op.getNumRegions() != 0) { - needBarrier = true; - break; - } - if (isa(&op) && !llvm::is_contained(aliasGroup, &op)) { - needBarrier = true; - break; - } - } - } - if (!needBarrier) - return; - OpBuilder builder(alloc); - // TODO: make it a option if needed. 
- bool hasAsyncCopies = true; - if (hasAsyncCopies) { - Value groupToken = builder.create( - funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()), - SmallVector()); - builder.create(funcOp.getLoc(), groupToken, - builder.getI32IntegerAttr(0)); - } - builder.create(alloc->getLoc()); -} - namespace { struct LLVMGPUPackSharedMemoryAllocPass @@ -67,35 +24,7 @@ struct LLVMGPUPackSharedMemoryAllocPass registry.insert(); } - void runOnOperation() override { - func::FuncOp funcOp = getOperation(); - DominanceInfo dominators(funcOp); - SmallVector allocs; - funcOp.walk([&](memref::AllocOp alloc) { - if (hasSharedMemoryAddressSpace(alloc.getType())) { - allocs.push_back(alloc); - } - }); - // First sink the alloc as low as possible in the CFG. - sinkOpsInCFG(allocs, dominators); - SmallVector aliasGroups; - analyseAllocsForPacking(funcOp, allocs, aliasGroups); - // If there is 1 or less alias group there is nothing to do. - if (aliasGroups.size() <= 1) - return; - - // Pack all the allocations into one i8 alloc. - // We may need to add extra barriers to make sure we are done writting or - // reading from the previous alias group before starting a new one. 
- for (size_t i = 0; i < aliasGroups.size(); i++) { - for (Operation *alloc : aliasGroups[i]) { - addBarrier(funcOp, alloc, aliasGroups[i]); - } - } - - OpBuilder builder(funcOp.getContext()); - packAllocs(builder, funcOp, aliasGroups); - } + void runOnOperation() override { packSharedMemoryAlloc(getOperation()); } }; } // namespace diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h index 1d3e79e15bf7..c2c5245086ae 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h @@ -18,6 +18,10 @@ namespace mlir { namespace iree_compiler { +//===----------------------------------------------------------------------===// +// Passes +//===----------------------------------------------------------------------===// + /// Lowering using SIMT CUDA core operations. void addGPUMatmulSimtPassPipeline(OpPassManager &pm); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp index 8a8431855bb0..83b38444f237 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp @@ -48,6 +48,7 @@ using llvm::dbgs; #define DBGS_VECTOR_TO_MMA() (dbgs() << '[' << DEBUG_VECTOR_TO_MMA << "] ") using namespace mlir; +using namespace mlir::iree_compiler; using namespace mlir::iree_compiler::IREE; iree_compiler::IREE::transform_dialect::LLVMGPUExtensions::LLVMGPUExtensions() { @@ -1478,5 +1479,20 @@ transform_dialect::EliminateGpuBarriersOp::applyToOne( return DiagnosedSilenceableFailure::success(); } +DiagnosedSilenceableFailure +transform_dialect::PackSharedMemoryAllocOp::applyToOne( + transform::TransformRewriter &rewriter, func::FuncOp target, + transform::ApplyToEachResultList &results, + 
    transform::TransformState &state) { +  packSharedMemoryAlloc(target); +  return DiagnosedSilenceableFailure::success(); +} + +void transform_dialect::PackSharedMemoryAllocOp::getEffects( +    SmallVectorImpl &effects) { +  transform::onlyReadsHandle(getTarget(), effects); +  transform::modifiesPayload(effects); +} + #define GET_OP_CLASSES #include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.cpp.inc" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td index 718a82d84d51..9c2f6ee94c56 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td @@ -648,4 +648,38 @@ def EliminateGpuBarriersOp : }]; } +def PackSharedMemoryAllocOp : Op, + TransformEachOpTrait, + TransformOpInterface, + ReportTrackingListenerFailuresOpTrait]> { + let summary = "Pack shared memory allocation to reduce memory usage"; + let description = [{ + Looks for allocs in shared memory space with overlapping liveness and + groups them, then packs all the allocations in each group into one i8 + alloc. Also adds barriers to make sure we are done writing/reading + from the previous alias group before starting a new one. + + #### Return modes + + It does not consume the target handle and always returns success. 
+ }]; + + let arguments = ( + ins TransformHandleTypeInterface:$target + ); + let results = (outs); + + let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)"; + let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect"; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::func::FuncOp funcOp, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + #endif // IREE_COMPILER_CODEGEN_LLVMGPU_TRANSFORMEXTENSIONS_LLVMGPUEXTENSIONS diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel index c139d79f0104..62e6766ff4be 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel @@ -24,6 +24,7 @@ iree_compiler_cc_library( "LLVMGPUUtils.h", ], deps = [ + "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "@llvm-project//llvm:Support", "@llvm-project//mlir:AffineDialect", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt index dfb1223564e2..8609095598b5 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt @@ -29,6 +29,7 @@ iree_cc_library( MLIRMemRefDialect MLIRNVGPUDialect MLIRVectorDialect + iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils PUBLIC ) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp index 824251be55c6..997f15063d8a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp @@ -6,6 +6,7 @@ 
 #include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h" +#include "iree/compiler/Codegen/Transforms/Transforms.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -348,5 +349,71 @@ void reorderTranspose(RewriterBase &rewriter, func::FuncOp funcOp) { } } +/// Insert barriers and wait operations if there are allocs of a different alias +/// group before the given alloc. +static void addBarrier(func::FuncOp funcOp, Operation *alloc, + ArrayRef aliasGroup) { + Block *entryBlock = &(*funcOp.getBlocks().begin()); + bool needBarrier = false; + if (alloc->getBlock() != entryBlock) { + needBarrier = true; + } else { + for (Operation &op : entryBlock->getOperations()) { + if (&op == alloc) + break; + if (op.getNumRegions() != 0) { + needBarrier = true; + break; + } + if (isa(&op) && !llvm::is_contained(aliasGroup, &op)) { + needBarrier = true; + break; + } + } + } + if (!needBarrier) + return; + OpBuilder builder(alloc); + // TODO: make it an option if needed. + bool hasAsyncCopies = true; + if (hasAsyncCopies) { + Value groupToken = builder.create( + funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()), + SmallVector()); + builder.create(funcOp.getLoc(), groupToken, + builder.getI32IntegerAttr(0)); + } + builder.create(alloc->getLoc()); +} + +void packSharedMemoryAlloc(func::FuncOp funcOp) { + DominanceInfo dominators(funcOp); + SmallVector allocs; + funcOp.walk([&](memref::AllocOp alloc) { + if (hasSharedMemoryAddressSpace(alloc.getType())) { + allocs.push_back(alloc); + } + }); + // First sink the alloc as low as possible in the CFG. + sinkOpsInCFG(allocs, dominators); + SmallVector aliasGroups; + analyseAllocsForPacking(funcOp, allocs, aliasGroups); + // If there is 1 or less alias group there is nothing to do. + if (aliasGroups.size() <= 1) + return; + + // Pack all the allocations into one i8 alloc. 
+ // We may need to add extra barriers to make sure we are done writing or + // reading from the previous alias group before starting a new one. + for (size_t i = 0; i < aliasGroups.size(); i++) { + for (Operation *alloc : aliasGroups[i]) { + addBarrier(funcOp, alloc, aliasGroups[i]); + } + } + + OpBuilder builder(funcOp.getContext()); + packAllocs(builder, funcOp, aliasGroups); +} + } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h index 2c96986f53cc..ff6d3f82d16c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h @@ -25,6 +25,14 @@ void doLayoutAnalysisAndDistribution(RewriterBase &rewriter, /// Function to reorder transposes and elementwise ops. void reorderTranspose(RewriterBase &rewriter, func::FuncOp funcOp); +/// Look for allocs in shared memory space with overlapping liveness, +/// group them, and then pack all the allocations in each group into one i8 +/// alloc. +/// +/// Also adds barriers to make sure we are done writing/reading +/// from the previous alias group before starting a new one. 
+void packSharedMemoryAlloc(func::FuncOp funcOp); + } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 86fbf96d8eb5..6fd983323e53 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -50,6 +50,7 @@ iree_lit_test_suite( "transform_dialect_vector_distribution.mlir", "transform_dialect_bufferize.mlir", "transform_dialect_eliminate_gpu_barriers.mlir", + "transform_dialect_pack_shared_memory_alloc.mlir", "transform_dialect_promote_operands.mlir", "transform_distribute_forall.mlir", "transform_gpu_pipelining.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index b3c7270f3fa7..13430dfaef13 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -45,6 +45,7 @@ iree_lit_test_suite( "transform_dialect_bufferize.mlir" "transform_dialect_eliminate_gpu_barriers.mlir" "transform_dialect_hoist_allocs.mlir" + "transform_dialect_pack_shared_memory_alloc.mlir" "transform_dialect_promote_operands.mlir" "transform_dialect_vector_distribution.mlir" "transform_distribute_forall.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir new file mode 100644 index 000000000000..da0e7bc69a25 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir @@ -0,0 +1,33 @@ +// RUN: iree-opt %s --iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s + +// CHECK-LABEL: shared_memory_disjoint +// CHECK-NOT: gpu.barrier +// CHECK-DAG: 
%[[PACKED:.+]] = memref.alloc() : memref<1024xi8, #gpu.address_space> +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK: memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space> to memref<128xf32, #gpu.address_space> +// CHECK: %[[C512:.+]] = arith.constant 512 : index +// CHECK: memref.view %[[PACKED]][%[[C512]]][] : memref<1024xi8, #gpu.address_space> to memref<128xf32, #gpu.address_space> +// CHECK: nvgpu.device_async_create_group +// CHECK: nvgpu.device_async_wait %0 {numGroups = 0 : i32} +// CHECK: gpu.barrier +// CHECK: memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space> to memref<32xf32, #gpu.address_space> +func.func @shared_memory_disjoint() { + %c0 = arith.constant 0 : index + %cst_f32 = arith.constant 0.000000e+00 : f32 + %cst_i8 = arith.constant 0 : i8 + %0 = memref.alloc() : memref<128xf32, #gpu.address_space> + %1 = memref.alloc() : memref<128xf32, #gpu.address_space> + %2 = memref.alloc() : memref<32xf32, #gpu.address_space> + memref.store %cst_f32, %0[%c0] : memref<128xf32, #gpu.address_space> + memref.store %cst_f32, %1[%c0] : memref<128xf32, #gpu.address_space> + memref.store %cst_f32, %0[%c0] : memref<128xf32, #gpu.address_space> + memref.store %cst_f32, %2[%c0] : memref<32xf32, #gpu.address_space> + return +} + +transform.sequence failures(propagate) { +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.iree.pack_shared_memory_alloc %0 : (!transform.any_op) -> () + transform.iree.apply_cse %0 : !transform.any_op +}