forked from iree-org/iree
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Codegen][GPU] Add pass to annotate memory spaces on allocations (ire…
…e-org#18251) Trying to infer the memory space of an allocation from within the bufferization alloc callback function is too late. This adds a rudimentary pass to annotate the memory space in obvious situations and then disallows all cases of a bufferization allocation without an already pre-determined memory space (for the LLVMGPUTileAndFuse pipeline). This gives us correctness guarantees that were somewhat hand-wavy before. This makes all allocations that aren't marked explicitly as shared (or can be obviously inferred as shared) thread local. Any previous lowerings that violate this invariant are a bug (most likely from a failure to tile an operation).
- Loading branch information
Showing
13 changed files
with
238 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
106 changes: 106 additions & 0 deletions
106
compiler/src/iree/compiler/Codegen/Common/GPU/GPUInferMemorySpace.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
// Copyright 2024 The IREE Authors | ||
// | ||
// Licensed under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
||
#include "iree/compiler/Codegen/Common/GPU/Passes.h" | ||
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" | ||
#include "iree/compiler/Codegen/Utils/GPUUtils.h" | ||
#include "llvm/ADT/STLExtras.h" | ||
#include "mlir/Dialect/Bufferization/IR/Bufferization.h" | ||
#include "mlir/Dialect/GPU/IR/GPUDialect.h" | ||
#include "mlir/IR/Matchers.h" | ||
#include "mlir/IR/Visitors.h" | ||
#include "mlir/Interfaces/FunctionInterfaces.h" | ||
|
||
namespace mlir::iree_compiler { | ||
|
||
#define GEN_PASS_DEF_GPUINFERMEMORYSPACEPASS | ||
#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" | ||
|
||
namespace { | ||
|
||
/// Pass to infer the memory spaces of unmarked `bufferization.alloc_tensor`
/// ops. Inferring the memory space during bufferization (in the allocation
/// function) is infeasible due to some limited analysis of surrounding loop
/// structures needed. After this pass, any unexpected allocations are then
/// treated as a compiler failure indicating something went wrong during
/// bufferization.
struct GPUInferMemorySpacePass final
    : impl::GPUInferMemorySpacePassBase<GPUInferMemorySpacePass> {

  // Walks every `bufferization.alloc_tensor` in the function, validating any
  // pre-existing memory space and annotating unmarked allocations as either
  // workgroup (shared) or private (thread-local) memory.
  void runOnOperation() override;
};
|
||
/// Returns true if `alloc` can be unambiguously inferred to live in shared
/// (workgroup) memory. This holds only when every user of the allocation is a
/// thread- or warp-distributed `scf.forall` op (i.e. the tensor is used as a
/// distributed destination). All other shared allocations are expected to be
/// explicitly annotated in advance.
bool isDefinitelyShared(bufferization::AllocTensorOp alloc) {
  for (auto *consumer : alloc->getUsers()) {
    auto distributedLoop = dyn_cast<scf::ForallOp>(consumer);
    if (!distributedLoop)
      return false;
    bool hasGpuMapping =
        forallOpHasMappingType<gpu::GPUThreadMappingAttr,
                               gpu::GPUWarpMappingAttr>(distributedLoop);
    if (!hasGpuMapping)
      return false;
  }
  // Note: an allocation with no users is vacuously treated as shared.
  return true;
}
|
||
void GPUInferMemorySpacePass::runOnOperation() { | ||
MLIRContext *context = &getContext(); | ||
FunctionOpInterface funcOp = getOperation(); | ||
|
||
gpu::AddressSpaceAttr privateAddressSpace = gpu::AddressSpaceAttr::get( | ||
context, gpu::GPUDialect::getPrivateAddressSpace()); | ||
gpu::AddressSpaceAttr sharedAddressSpace = gpu::AddressSpaceAttr::get( | ||
context, gpu::GPUDialect::getWorkgroupAddressSpace()); | ||
|
||
WalkResult res = funcOp.walk([&](bufferization::AllocTensorOp alloc) { | ||
// Continue if the allocation already has a valid memory space. | ||
std::optional<Attribute> currentMemSpace = alloc.getMemorySpace(); | ||
if (currentMemSpace.has_value()) { | ||
if (currentMemSpace.value() == privateAddressSpace || | ||
currentMemSpace.value() == sharedAddressSpace) { | ||
return WalkResult::advance(); | ||
} | ||
alloc.emitOpError( | ||
"unexpected gpu memory space must be private or workgroup."); | ||
return WalkResult::interrupt(); | ||
} | ||
|
||
/// Determining GPU memory spaces must be trivial by the time of this pass. | ||
/// Because this pass runs immediately before bufferization, input IR is | ||
/// expected to mix (thread) distributed and shared contexts. Because after | ||
/// bufferization distributed loops (scf.forall) ops are expected to be | ||
/// inlined as-is with no further tiling occurring, all tensors at this | ||
/// point in the IR are assumed to be thread-local unless it is explicitly | ||
/// marked as shared. This gives the following invariants: | ||
/// | ||
/// 1. If the alloc_tensor is annotated with `#gpu.address_space<private>` | ||
/// already, or if it is used as the immediate destination of a thread | ||
/// or warp distributed `scf.forall` op, then the allocation must be | ||
/// shared memory. | ||
/// 2. All other allocations are thread local. | ||
/// | ||
/// Any allocation that is not explicitly marked as shared memory that is | ||
/// supposed to be indicates a bug in earlier passes/lowerings. | ||
if (isDefinitelyShared(alloc)) { | ||
alloc.setMemorySpaceAttr(sharedAddressSpace); | ||
} else { | ||
alloc.setMemorySpaceAttr(privateAddressSpace); | ||
} | ||
return WalkResult::advance(); | ||
}); | ||
|
||
if (res.wasInterrupted()) { | ||
funcOp->emitOpError("failed to set the gpu memory space for all " | ||
"`bufferization.alloc_tensor` ops"); | ||
return signalPassFailure(); | ||
} | ||
} | ||
|
||
} // namespace | ||
|
||
} // namespace mlir::iree_compiler |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
54 changes: 54 additions & 0 deletions
54
compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_infer_memory_space.mlir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
// RUN: iree-opt %s --split-input-file --verify-diagnostics \ | ||
// RUN: --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-infer-memory-space))" | FileCheck %s | ||
|
||
// The allocation's only user is the `vector.transfer_write` inside the
// lane-mapped forall — not the forall op itself — so it cannot be inferred
// as a shared destination and is annotated as thread-local private memory.
func.func @write_in_lane_forall(%dest : tensor<4x3xi32>) -> tensor<4x3xi32> {
  %alloc = bufferization.alloc_tensor() : tensor<2x3xi32>
  %cst = arith.constant dense<0> : vector<2x3xi32>
  %c0 = arith.constant 0 : index
  %res = scf.forall (%arg0) in (2) shared_outs(%arg1 = %dest) -> tensor<4x3xi32> {
    %w = vector.transfer_write %cst, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<2x3xi32>, tensor<2x3xi32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %w into %arg1[%arg0, 0] [2, 3] [1, 1] : tensor<2x3xi32> into tensor<4x3xi32>
    }
  } {mapping = [#iree_gpu.lane_id<0>]}
  return %res : tensor<4x3xi32>
}

// CHECK: func @write_in_lane_forall
// CHECK:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<private>}
// CHECK:   vector.transfer_write %{{.*}}, %[[ALLOC]]
|
||
// ----- | ||
|
||
// The allocation is used directly as the shared_outs destination of a
// warp-mapped `scf.forall`, so the pass infers workgroup (shared) memory.
func.func @forall_shared_dest(%w : tensor<2x3xi32>) -> tensor<4x3xi32> {
  %dest = bufferization.alloc_tensor() : tensor<4x3xi32>
  %res = scf.forall (%arg0) in (2) shared_outs(%arg1 = %dest) -> tensor<4x3xi32> {
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %w into %arg1[%arg0, 0] [2, 3] [1, 1] : tensor<2x3xi32> into tensor<4x3xi32>
    }
  } {mapping = [#gpu.warp<x>]}
  return %res : tensor<4x3xi32>
}

// CHECK: func @forall_shared_dest
// CHECK:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>}
// CHECK:   scf.forall {{.*}} shared_outs(%{{.*}} = %[[ALLOC]])
|
||
// ----- | ||
|
||
// An allocation already carrying a valid GPU memory space (here private) is
// left untouched by the pass.
func.func @already_annotated_alloc() -> tensor<2x3xi32> {
  %alloc = bufferization.alloc_tensor() {memory_space = #gpu.address_space<private>} : tensor<2x3xi32>
  return %alloc : tensor<2x3xi32>
}

// CHECK: func @already_annotated_alloc
// CHECK:   bufferization.alloc_tensor() {memory_space = #gpu.address_space<private>}
|
||
// ----- | ||
|
||
// A memory space that is neither private nor workgroup triggers a per-op
// error and an overall pass failure reported on the function.
// expected-error@+1 {{failed to set the gpu memory space for all `bufferization.alloc_tensor` ops}}
func.func @unknown_memory_space() -> tensor<2x3xi32> {
  // expected-error@+1 {{unexpected gpu memory space must be private or workgroup.}}
  %alloc = bufferization.alloc_tensor() {memory_space = "bad"} : tensor<2x3xi32>
  return %alloc : tensor<2x3xi32>
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters