[DmaLoopSubsumption] Relax circular dma loop subsumption condition #826

Merged 6 commits on Oct 10, 2024.
Changes from 5 commits.
@@ -7,7 +7,9 @@
#include "iree-amd-aie/IR/AMDAIEDialect.h"
#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree-amd-aie/Transforms/Transforms.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#define DEBUG_TYPE "iree-amdaie-canonicalize-doubly-strided-dma"

@@ -17,90 +19,105 @@ namespace {

/// Recognize linear accesses across multiple DMA access dimensions and fold
/// them.
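/// (Illustrative example, not part of the source: an access with sizes [2, 8]
/// and strides [8, 1] touches the same elements, in the same order, as a
/// single dimension with size [16] and stride [1], so the two dimensions can
/// be folded into one.)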
LogicalResult foldDmaOpLinearDims(RewriterBase &rewriter,
AMDAIE::DoublyStridedOpInterface op) {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
SmallVector<OpFoldResult> newSourceOffsets, newSourceSizes, newSourceStrides,
newTargetOffsets, newTargetSizes, newTargetStrides;
LogicalResult sourceRes =
foldLinearDims(op.getContext(), sourceOffsets, sourceSizes, sourceStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
LogicalResult targetRes =
foldLinearDims(op.getContext(), targetOffsets, targetSizes, targetStrides,
newTargetOffsets, newTargetSizes, newTargetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
struct FoldDmaOpLinearDims
: public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
PatternRewriter &rewriter) const override {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
SmallVector<OpFoldResult> newSourceOffsets, newSourceSizes,
newSourceStrides, newTargetOffsets, newTargetSizes, newTargetStrides;
LogicalResult sourceRes = foldLinearDims(
op.getContext(), sourceOffsets, sourceSizes, sourceStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
LogicalResult targetRes = foldLinearDims(
op.getContext(), targetOffsets, targetSizes, targetStrides,
newTargetOffsets, newTargetSizes, newTargetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}
};

/// Fold single dimension linear accesses and make them implicit.
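/// (Illustrative example, not part of the source: a pattern consisting of a
/// single dimension with offset 0, size 1024 and stride 1 is just a plain
/// linear read or write of the buffer, so the explicit offsets/sizes/strides
/// can be dropped and left implicit.)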
LogicalResult foldDmaOpSingleDims(RewriterBase &rewriter,
AMDAIE::DoublyStridedOpInterface op) {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
LogicalResult sourceRes =
foldSingleDim(sourceOffsets, sourceSizes, sourceStrides);
LogicalResult targetRes =
foldSingleDim(targetOffsets, targetSizes, targetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
struct FoldDmaOpSingleDims
: public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
PatternRewriter &rewriter) const override {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
LogicalResult sourceRes =
foldSingleDim(sourceOffsets, sourceSizes, sourceStrides);
LogicalResult targetRes =
foldSingleDim(targetOffsets, targetSizes, targetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, targetOffsets, targetSizes, targetStrides, sourceOffsets,
sourceSizes, sourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, targetOffsets, targetSizes, targetStrides, sourceOffsets,
sourceSizes, sourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}
};

/// Fold unit dimensions within a strided access pattern.
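/// (Illustrative example, not part of the source: a dimension with size 1 and
/// offset 0 contributes nothing to the addresses that are accessed, whatever
/// its stride, so it can be removed from the pattern.)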
LogicalResult foldDmaOpUnitDims(RewriterBase &rewriter,
AMDAIE::DoublyStridedOpInterface op) {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
SmallVector<OpFoldResult> newSourceOffsets, newSourceSizes, newSourceStrides,
newTargetOffsets, newTargetSizes, newTargetStrides;
LogicalResult sourceRes =
foldUnitDims(sourceOffsets, sourceSizes, sourceStrides, newSourceOffsets,
newSourceSizes, newSourceStrides);
LogicalResult targetRes =
foldUnitDims(targetOffsets, targetSizes, targetStrides, newTargetOffsets,
newTargetSizes, newTargetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
struct FoldDmaOpUnitDims
: public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
PatternRewriter &rewriter) const override {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
SmallVector<OpFoldResult> newSourceOffsets, newSourceSizes,
newSourceStrides, newTargetOffsets, newTargetSizes, newTargetStrides;
LogicalResult sourceRes =
foldUnitDims(sourceOffsets, sourceSizes, sourceStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
LogicalResult targetRes =
foldUnitDims(targetOffsets, targetSizes, targetStrides,
newTargetOffsets, newTargetSizes, newTargetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}
};

class AMDAIECanonicalizeDoublyStridedOpPass
: public impl::AMDAIECanonicalizeDoublyStridedOpBase<
@@ -121,30 +138,28 @@ class AMDAIECanonicalizeDoublyStridedOpPass

void AMDAIECanonicalizeDoublyStridedOpPass::runOnOperation() {
Operation *parentOp = getOperation();
IRRewriter rewriter(parentOp->getContext());

// Fold DMA unit dimensions. Needs to happen before folding linear dimensions
// to avoid blocking detection of linear dimension folding opportunities due
// to a unit dimension in between.
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) {
(void)foldDmaOpUnitDims(rewriter, dmaOp);
});
MLIRContext *context = &getContext();
RewritePatternSet patterns(context);

populateCanonicalizeDoublyStridedOpPatterns(patterns, foldSingleDims);
if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
parentOp->emitOpError(
"failed to canonicalize doubly strided DMA operations");
return signalPassFailure();
}
}

// Fold linear dimensions within a DMA op.
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) {
(void)foldDmaOpLinearDims(rewriter, dmaOp);
});
} // namespace

// Make DMA accesses with single dimension implicit.
void populateCanonicalizeDoublyStridedOpPatterns(RewritePatternSet &patterns,
bool foldSingleDims) {
patterns.add<FoldDmaOpUnitDims>(patterns.getContext());
patterns.add<FoldDmaOpLinearDims>(patterns.getContext());
if (foldSingleDims) {
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) {
(void)foldDmaOpSingleDims(rewriter, dmaOp);
});
patterns.add<FoldDmaOpSingleDims>(patterns.getContext());
}
}

} // namespace

std::unique_ptr<Pass> createAMDAIECanonicalizeDoublyStridedOpPass(
AMDAIECanonicalizeDoublyStridedOpOptions options) {
return std::make_unique<AMDAIECanonicalizeDoublyStridedOpPass>(options);
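As a rough illustration of what the three folding patterns above do, here is a small standalone sketch of unit-dimension removal and linear-dimension folding on an explicit offsets/sizes/strides triple. It is an illustration only: the AccessPattern struct and foldAccessPattern helper are invented for this example as simplified stand-ins for the utilities in AMDAIEDmaUtils.h, handle only zero-offset dimensions, and are not the actual implementation.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-in for a DMA access pattern: dimensions are listed from
// outermost to innermost; each dimension visits offset*stride + i*stride for
// index i in [0, size).
struct AccessPattern {
  std::vector<int64_t> offsets, sizes, strides;
};

// Sketch of the folds (zero-offset dimensions only):
// - a dimension with size 1 addresses nothing beyond its offset, so drop it;
// - an outer dimension folds into the next inner one when its stride equals
//   inner size * inner stride, i.e. it merely repeats the inner linear range.
AccessPattern foldAccessPattern(const AccessPattern &in) {
  AccessPattern out;
  for (std::size_t i = 0; i < in.sizes.size(); ++i) {
    if (in.sizes[i] == 1 && in.offsets[i] == 0) continue;  // drop unit dim
    if (!out.sizes.empty() && out.offsets.back() == 0 && in.offsets[i] == 0 &&
        out.strides.back() == in.sizes[i] * in.strides[i]) {
      out.sizes.back() *= in.sizes[i];  // fold linear dims
      out.strides.back() = in.strides[i];
      continue;
    }
    out.offsets.push_back(in.offsets[i]);
    out.sizes.push_back(in.sizes[i]);
    out.strides.push_back(in.strides[i]);
  }
  return out;
}

int main() {
  // offsets [0, 0, 0], sizes [4, 1, 8], strides [8, 3, 1]: the unit dimension
  // would block the linear fold of the dimensions around it; once it is
  // dropped, the remaining [4, 8] x [8, 1] pattern folds to size 32, stride 1.
  AccessPattern folded = foldAccessPattern({{0, 0, 0}, {4, 1, 8}, {8, 3, 1}});
  for (std::size_t i = 0; i < folded.sizes.size(); ++i)
    std::cout << "size " << folded.sizes[i] << ", stride " << folded.strides[i]
              << "\n";  // prints: size 32, stride 1
}

The example also shows why the old pass ran the unit-dimension fold before the linear fold: the size-1 dimension sits between two dimensions that only become foldable once it is removed. With the pattern-based rewrite, the greedy driver simply keeps applying all registered patterns until a fixed point is reached.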
@@ -56,6 +56,7 @@ void AMDAIEDmaCompositionPass::runOnOperation() {
onlyZeroStrideOnOuterDim);
}
populateStridedOpCombinationPattern(patterns);
populateCanonicalizeDoublyStridedOpPatterns(patterns, false);
if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
parentOp->emitOpError("failed to compose strided operations");
return signalPassFailure();
@@ -492,7 +492,7 @@ struct SubsumeLoopIntoDMA
if (!isa<LoopLikeOpInterface>(parentOp))
return rewriter.notifyMatchFailure(op, "Parent is not a loop-like op");

auto hasUsersInSameScope = [&](Value result) -> bool {
auto hasOtherUsersInSameScope = [&](Value result) -> bool {
for (Operation *userOp : result.getUsers()) {
if (userOp != op.getOperation() && parentOp->isProperAncestor(userOp)) {
return true;
@@ -501,6 +501,25 @@
return false;
};

auto hasCircularUsersInSameScope =
[&](SmallVector<AMDAIE::DoublyStridedOpInterface> users) -> bool {
bool currentCircularDma = false;
for (AMDAIE::DoublyStridedOpInterface userOp : llvm::reverse(users)) {
// Check if there is another circular DMA user in the same scope.
if (isa<AMDAIE::NpuCircularDmaCpyNdOp>(userOp) &&
userOp != op.getOperation()) {
return true;
}
// Check if there is another user before the current op in the same scope.
if (userOp == op.getOperation()) {
currentCircularDma = true;
continue;
}
if (currentCircularDma) return true;
}
return false;
};

uint8_t sourceMemspaceInt;
uint8_t targetMemspaceInt;
if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op.getOperation())) {
@@ -526,7 +545,7 @@
"merged with other connections, so abort loop subsumption as it "
"could potentially lead to deadlocks");
}
if (hasUsersInSameScope(connectionOp.getResult())) {
if (hasOtherUsersInSameScope(connectionOp.getResult())) {
return rewriter.notifyMatchFailure(
op,
"Has users of same DMA in scope, analysis to check validity of "
@@ -538,16 +557,28 @@
sourceMemspaceInt = npuCircularDmaOp.getSourceMemorySpaceAsUInt();
targetMemspaceInt = npuCircularDmaOp.getTargetMemorySpaceAsUInt();

// Check that the connection this `amdaie.npu.dma_cpy_nd` operation is
// operating on, is not being touched within the same scope. Otherwise,
// the rewrite is not valid in general as it would be changing the
// temporal usage of the source connection.
// Check that the connection this `amdaie.npu.circular_dma_cpy_nd` op is
// operating on satisfies the following conditions:
// 1) no other user of the connection has the Circular trait in the same
// scope; 2) no other user of the connection appears before this circular DMA
// op in the same scope. Otherwise, the rewrite is not valid in general, as it
// would change the temporal usage of the source connection.
AMDAIE::ConnectionOp connectionOp = npuCircularDmaOp.getConnectionOp();
if (!connectionOp) {
return rewriter.notifyMatchFailure(
op, "should operate on an `amdaie.connection` op");
}
if (hasUsersInSameScope(connectionOp.getResult())) {
// Walk all DMA ops in order and collect those that are users of the
// current connection op.
SmallVector<AMDAIE::DoublyStridedOpInterface> dmaUsers;
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface op) {
auto dmaConnection = dyn_cast_if_present<AMDAIE::ConnectionOp>(
op->getOperand(0).getDefiningOp());
if (dmaConnection && dmaConnection == connectionOp) {
dmaUsers.push_back(op);
}
});
if (hasCircularUsersInSameScope(dmaUsers)) {
return rewriter.notifyMatchFailure(
op,
"Has users of same DMA in scope, analysis to check validity of "
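The ordering logic in the new hasCircularUsersInSameScope lambda above is easier to see on a toy model. The sketch below is not the pass itself: the User struct and the names are invented for illustration, and a plain vector in program order stands in for walking the ops in the parent scope. It reproduces the relaxed condition: loop subsumption of a circular DMA op is rejected if the connection has another circular user in the same scope, or if any user of the connection comes before the circular op; users that only appear after it no longer block the rewrite.

#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for a DMA user of a connection, listed in program order.
struct User {
  std::string name;
  bool isCircular;
};

// Mirrors the reversed walk in the lambda above: reject if another circular
// user exists, or if any user precedes the current circular op in the scope.
bool hasCircularUsersInSameScope(const std::vector<User> &usersInOrder,
                                 const std::string &current) {
  bool seenCurrent = false;
  for (auto it = usersInOrder.rbegin(); it != usersInOrder.rend(); ++it) {
    if (it->isCircular && it->name != current) return true;
    if (it->name == current) {
      seenCurrent = true;
      continue;
    }
    // Iterating in reverse, any user visited after the current op was emitted
    // before it in program order.
    if (seenCurrent) return true;
  }
  return false;
}

int main() {
  // Circular op first, a plain DMA user after it: subsumption may proceed.
  std::vector<User> after = {{"circular0", true}, {"dma0", false}};
  // A plain DMA user before the circular op: subsumption is rejected.
  std::vector<User> before = {{"dma0", false}, {"circular0", true}};
  std::cout << hasCircularUsersInSameScope(after, "circular0") << "\n";   // 0
  std::cout << hasCircularUsersInSameScope(before, "circular0") << "\n";  // 1
}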
@@ -521,7 +521,8 @@ void buildAMDAIETransformPassPipeline(
}
modulePassManager.addPass(createLowerUKernelOpsToCallsPass());
if (useLowerToAIEPipeline == LowerToAIEPassPipeline::ObjectFifo) {
addAMDAIEObjectFifoLoweringPasses(modulePassManager, enablePacketFlow);
addAMDAIEObjectFifoLoweringPasses(modulePassManager, enablePacketFlow,
useTilePipeline);
} else if (useLowerToAIEPipeline == LowerToAIEPassPipeline::AIR) {
addMLIRAIRLoweringPasses(modulePassManager, device, useTilePipeline,
matmulElementwiseFusion);
@@ -541,11 +542,22 @@
}

void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
bool enablePacketFlow) {
bool enablePacketFlow,
TilePassPipeline useTilePipeline) {
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
passManager.addPass(memref::createFoldMemRefAliasOpsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createAMDAIEConvertToDmaPass());
// For matmul pipelines, we do the transpose on the target side for pack ops to
// get better performance, whereas for convolution pipelines the same settings
// cause an 'aie.dma_bd' error, so for now keep using transpose on the source
// side for both pack and unpack ops.
// TODO(vivian): explore the other options for conv ops.

[Review comment, Collaborator] What's the exact error?

[Reply, Contributor Author] The error is 'aie.dma_bd' op Cannot give more than 3 dimensions for step sizes and wraps in this tile (got 4 dimensions). The full IR dump is at https://gist.github.com/yzhang93/fd34425cc223eccbd233c7d1051f2d63

AMDAIEConvertToDmaOptions dmaOptions;
dmaOptions.packTransposeOnSource =
(useTilePipeline == TilePassPipeline::ConvDecomposePipeline) ? true
: false;
dmaOptions.unpackTransposeOnSource = true;
passManager.addPass(createAMDAIEConvertToDmaPass(dmaOptions));

passManager.addPass(createAMDAIENormalizeLoopBoundsPass());
passManager.addPass(createAMDAIEInsertCoresPass());
@@ -16,7 +16,8 @@ namespace mlir::iree_compiler::AMDAIE {

/// Add passes to lower to AIE objectFifos.
void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
bool enablePacketFlow);
bool enablePacketFlow,
TilePassPipeline useTilePipeline);

/// Add passes to lower from MLIR-AIR through AIE. This is
/// currently the default passes used for lowering after IREEs tiling.
@@ -39,6 +39,10 @@ LogicalResult normalizeLoopBounds(RewriterBase &rewriter, scf::ForOp forOp);
LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
scf::ForallOp forallOp);

/// Populate patterns that canonicalize doubly strided DMA operations.
void populateCanonicalizeDoublyStridedOpPatterns(RewritePatternSet &patterns,
bool foldSingleDims);

/// Populate patterns that subsume loops iterations into DMA access patterns.
void populateDmaLoopSubsumptionPattern(RewritePatternSet &patterns,
AMDAIE::AMDAIEDeviceModel &&deviceModel,