Skip to content

Commit

Permalink
[Transform] Add transform.iree.pack_shared_memory_alloc (#14503)
Browse files Browse the repository at this point in the history
This patch adds the `transform.iree.pack_shared_memory_alloc` op. This
op simply takes in a funcOp and applies the LLVMGPUPackSharedMemoryAlloc
pass to it.
  • Loading branch information
Groverkss authored Aug 2, 2023
1 parent b9c0623 commit fb8263f
Show file tree
Hide file tree
Showing 11 changed files with 167 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,54 +9,11 @@
#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/IR/Dominance.h"

namespace mlir {
namespace iree_compiler {

/// Insert barriers and wait operations if there are allocs of a different alias
/// group before the given alloc.
/// Insert barriers and wait operations if there are allocs of a different alias
/// group before the given alloc.
static void addBarrier(func::FuncOp funcOp, Operation *alloc,
                       ArrayRef<Operation *> aliasGroup) {
  Block *entryBlock = &(*funcOp.getBlocks().begin());
  bool needBarrier = false;
  // An alloc outside the entry block may execute after arbitrary other code,
  // so conservatively require a barrier.
  if (alloc->getBlock() != entryBlock) {
    needBarrier = true;
  } else {
    // Scan the entry block up to `alloc`: any op carrying regions (control
    // flow whose contents we do not analyze) or any alloc belonging to a
    // different alias group forces a barrier.
    for (Operation &op : entryBlock->getOperations()) {
      if (&op == alloc)
        break;
      if (op.getNumRegions() != 0) {
        needBarrier = true;
        break;
      }
      if (isa<memref::AllocOp>(&op) && !llvm::is_contained(aliasGroup, &op)) {
        needBarrier = true;
        break;
      }
    }
  }
  if (!needBarrier)
    return;
  // Builder is positioned immediately before `alloc`, so the synchronization
  // ops are emitted ahead of the allocation they protect.
  OpBuilder builder(alloc);
  // TODO: make it an option if needed.
  bool hasAsyncCopies = true;
  if (hasAsyncCopies) {
    // Create an (empty) async group and wait on it with numGroups = 0 so all
    // in-flight nvgpu async copies complete before the barrier.
    Value groupToken = builder.create<nvgpu::DeviceAsyncCreateGroupOp>(
        funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()),
        SmallVector<Value>());
    builder.create<nvgpu::DeviceAsyncWaitOp>(funcOp.getLoc(), groupToken,
                                             builder.getI32IntegerAttr(0));
  }
  builder.create<gpu::BarrierOp>(alloc->getLoc());
}

namespace {

struct LLVMGPUPackSharedMemoryAllocPass
Expand All @@ -67,35 +24,7 @@ struct LLVMGPUPackSharedMemoryAllocPass
registry.insert<nvgpu::NVGPUDialect>();
}

void runOnOperation() override {
  func::FuncOp funcOp = getOperation();
  DominanceInfo dominators(funcOp);
  // Collect every alloc placed in the shared (workgroup) address space.
  SmallVector<Operation *> allocs;
  funcOp.walk([&](memref::AllocOp alloc) {
    if (hasSharedMemoryAddressSpace(alloc.getType())) {
      allocs.push_back(alloc);
    }
  });
  // First sink the alloc as low as possible in the CFG.
  sinkOpsInCFG(allocs, dominators);
  // Partition the allocs into alias groups that can share the same storage.
  SmallVector<AliasGroup> aliasGroups;
  analyseAllocsForPacking(funcOp, allocs, aliasGroups);
  // If there is 1 or less alias group there is nothing to do.
  if (aliasGroups.size() <= 1)
    return;

  // Pack all the allocations into one i8 alloc.
  // We may need to add extra barriers to make sure we are done writing or
  // reading from the previous alias group before starting a new one.
  for (size_t i = 0; i < aliasGroups.size(); i++) {
    for (Operation *alloc : aliasGroups[i]) {
      addBarrier(funcOp, alloc, aliasGroups[i]);
    }
  }

  OpBuilder builder(funcOp.getContext());
  packAllocs(builder, funcOp, aliasGroups);
}
void runOnOperation() override { packSharedMemoryAlloc(getOperation()); }
};
} // namespace

Expand Down
4 changes: 4 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
namespace mlir {
namespace iree_compiler {

//===----------------------------------------------------------------------===//
// Passes
//===----------------------------------------------------------------------===//

/// Lowering using SIMT CUDA core operations.
void addGPUMatmulSimtPassPipeline(OpPassManager &pm);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ using llvm::dbgs;
#define DBGS_VECTOR_TO_MMA() (dbgs() << '[' << DEBUG_VECTOR_TO_MMA << "] ")

using namespace mlir;
using namespace mlir::iree_compiler;
using namespace mlir::iree_compiler::IREE;

iree_compiler::IREE::transform_dialect::LLVMGPUExtensions::LLVMGPUExtensions() {
Expand Down Expand Up @@ -1478,5 +1479,20 @@ transform_dialect::EliminateGpuBarriersOp::applyToOne(
return DiagnosedSilenceableFailure::success();
}

/// Transform-dialect entry point for shared-memory packing: forwards the
/// payload function to the packSharedMemoryAlloc() utility.
///
/// Always reports success; the underlying utility is best-effort and returns
/// early when there is nothing to pack.
DiagnosedSilenceableFailure
transform_dialect::PackSharedMemoryAllocOp::applyToOne(
    transform::TransformRewriter &rewriter, func::FuncOp funcOp,
    transform::ApplyToEachResultList &results,
    transform::TransformState &state) {
  packSharedMemoryAlloc(funcOp);
  return DiagnosedSilenceableFailure::success();
}

void transform_dialect::PackSharedMemoryAllocOp::getEffects(
SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
transform::onlyReadsHandle(getTarget(), effects);
transform::modifiesPayload(effects);
}

#define GET_OP_CLASSES
#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.cpp.inc"
Original file line number Diff line number Diff line change
Expand Up @@ -648,4 +648,38 @@ def EliminateGpuBarriersOp :
}];
}

def PackSharedMemoryAllocOp : Op<Transform_Dialect, "iree.pack_shared_memory_alloc",
    [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
     TransformEachOpTrait,
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let summary = "Pack shared memory allocation to reduce memory usage";
  let description = [{
    Looks for allocs in shared memory space with overlapping liveness and
    groups them, then packs all the allocations in each group into one i8
    alloc. Also adds barriers to make sure we are done writing/reading
    from the previous alias group before starting a new one.

    #### Return modes

    It does not consume the target handle and always returns success.
  }];

  let arguments = (
    ins TransformHandleTypeInterface:$target
  );
  let results = (outs);

  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";
  let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::func::FuncOp funcOp,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

#endif // IREE_COMPILER_CODEGEN_LLVMGPU_TRANSFORMEXTENSIONS_LLVMGPUEXTENSIONS
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ iree_compiler_cc_library(
"LLVMGPUUtils.h",
],
deps = [
"//compiler/src/iree/compiler/Codegen/Transforms",
"//compiler/src/iree/compiler/Codegen/Utils",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:AffineDialect",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ iree_cc_library(
MLIRMemRefDialect
MLIRNVGPUDialect
MLIRVectorDialect
iree::compiler::Codegen::Transforms
iree::compiler::Codegen::Utils
PUBLIC
)
Expand Down
67 changes: 67 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"

#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
Expand Down Expand Up @@ -348,5 +349,71 @@ void reorderTranspose(RewriterBase &rewriter, func::FuncOp funcOp) {
}
}

/// Insert barriers and wait operations if there are allocs of a different alias
/// group before the given alloc.
static void addBarrier(func::FuncOp funcOp, Operation *alloc,
                       ArrayRef<Operation *> aliasGroup) {
  Block *entry = &funcOp.getBlocks().front();
  // An alloc outside the entry block may run after arbitrary code: always
  // synchronize. Otherwise scan the entry block up to the alloc.
  bool requiresBarrier = alloc->getBlock() != entry;
  if (!requiresBarrier) {
    for (Operation &candidate : entry->getOperations()) {
      if (&candidate == alloc)
        break;
      // Ops with regions hide control flow we do not analyze; allocs from
      // another alias group may still be live. Either case needs a barrier.
      bool hasRegions = candidate.getNumRegions() != 0;
      bool foreignAlloc = isa<memref::AllocOp>(&candidate) &&
                          !llvm::is_contained(aliasGroup, &candidate);
      if (hasRegions || foreignAlloc) {
        requiresBarrier = true;
        break;
      }
    }
  }
  if (!requiresBarrier)
    return;
  // Emit the synchronization immediately before the alloc it protects.
  OpBuilder builder(alloc);
  // TODO: make it an option if needed.
  bool hasAsyncCopies = true;
  if (hasAsyncCopies) {
    // Create an empty async group and wait with numGroups = 0 so all pending
    // nvgpu async copies finish before the barrier.
    Value token = builder.create<nvgpu::DeviceAsyncCreateGroupOp>(
        funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()),
        SmallVector<Value>());
    builder.create<nvgpu::DeviceAsyncWaitOp>(funcOp.getLoc(), token,
                                             builder.getI32IntegerAttr(0));
  }
  builder.create<gpu::BarrierOp>(alloc->getLoc());
}

/// Pack shared-memory allocations with non-overlapping liveness into a single
/// i8 buffer, inserting barriers between alias groups as needed.
void packSharedMemoryAlloc(func::FuncOp funcOp) {
  DominanceInfo dominators(funcOp);

  // Collect every alloc that lives in the shared (workgroup) address space.
  SmallVector<Operation *> sharedMemAllocs;
  funcOp.walk([&](memref::AllocOp allocOp) {
    if (hasSharedMemoryAddressSpace(allocOp.getType()))
      sharedMemAllocs.push_back(allocOp);
  });

  // First sink the allocs as low as possible in the CFG.
  sinkOpsInCFG(sharedMemAllocs, dominators);

  // Partition the allocs into alias groups that can share storage.
  SmallVector<AliasGroup> aliasGroups;
  analyseAllocsForPacking(funcOp, sharedMemAllocs, aliasGroups);
  // If there is 1 or less alias group there is nothing to do.
  if (aliasGroups.size() <= 1)
    return;

  // Pack all the allocations into one i8 alloc.
  // We may need to add extra barriers to make sure we are done writing or
  // reading from the previous alias group before starting a new one.
  for (const AliasGroup &group : aliasGroups)
    for (Operation *allocOp : group)
      addBarrier(funcOp, allocOp, group);

  OpBuilder builder(funcOp.getContext());
  packAllocs(builder, funcOp, aliasGroups);
}

} // namespace iree_compiler
} // namespace mlir
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ void doLayoutAnalysisAndDistribution(RewriterBase &rewriter,
/// Function to reorder transposes and elementwise ops.
void reorderTranspose(RewriterBase &rewriter, func::FuncOp funcOp);

/// Look for allocs in shared memory space with overlapping liveness,
/// group them, and then pack all the allocations in each group into one i8
/// alloc.
///
/// Also adds barriers to make sure we are done writing/reading
/// from the previous alias group before starting a new one.
void packSharedMemoryAlloc(func::FuncOp funcOp);

} // namespace iree_compiler
} // namespace mlir

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ iree_lit_test_suite(
"transform_dialect_vector_distribution.mlir",
"transform_dialect_bufferize.mlir",
"transform_dialect_eliminate_gpu_barriers.mlir",
"transform_dialect_pack_shared_memory_alloc.mlir",
"transform_dialect_promote_operands.mlir",
"transform_distribute_forall.mlir",
"transform_gpu_pipelining.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ iree_lit_test_suite(
"transform_dialect_bufferize.mlir"
"transform_dialect_eliminate_gpu_barriers.mlir"
"transform_dialect_hoist_allocs.mlir"
"transform_dialect_pack_shared_memory_alloc.mlir"
"transform_dialect_promote_operands.mlir"
"transform_dialect_vector_distribution.mlir"
"transform_distribute_forall.mlir"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// RUN: iree-opt %s --iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s

// CHECK-LABEL: shared_memory_disjoint
// CHECK-NOT: gpu.barrier
// CHECK-DAG: %[[PACKED:.+]] = memref.alloc() : memref<1024xi8, #gpu.address_space<workgroup>>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK: memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
// CHECK: %[[C512:.+]] = arith.constant 512 : index
// CHECK: memref.view %[[PACKED]][%[[C512]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
// CHECK: nvgpu.device_async_create_group
// CHECK: nvgpu.device_async_wait %0 {numGroups = 0 : i32}
// CHECK: gpu.barrier
// CHECK: memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<32xf32, #gpu.address_space<workgroup>>
// Three workgroup-space allocs: %0 and %1 are both 128xf32, %2 is 32xf32.
// The stores interleave so %0/%1 have overlapping liveness while %2's use
// comes last, allowing the pass to reuse the packed buffer for %2.
func.func @shared_memory_disjoint() {
  %c0 = arith.constant 0 : index
  %cst_f32 = arith.constant 0.000000e+00 : f32
  %cst_i8 = arith.constant 0 : i8
  %0 = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
  %1 = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
  %2 = memref.alloc() : memref<32xf32, #gpu.address_space<workgroup>>
  memref.store %cst_f32, %0[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
  memref.store %cst_f32, %1[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
  memref.store %cst_f32, %0[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
  memref.store %cst_f32, %2[%c0] : memref<32xf32, #gpu.address_space<workgroup>>
  return
}

// Match every func.func in the payload, pack its shared-memory allocs, then
// apply CSE to clean up duplicated index constants created by the packing.
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
  transform.iree.pack_shared_memory_alloc %0 : (!transform.any_op) -> ()
  transform.iree.apply_cse %0 : !transform.any_op
}

0 comments on commit fb8263f

Please sign in to comment.