From 93570bceb454685e95e4037dc591d37a2420f7b4 Mon Sep 17 00:00:00 2001 From: yzhang93 Date: Thu, 3 Oct 2024 13:52:49 -0700 Subject: [PATCH 1/5] [DmaLoopSubsumption] Relax circular dma loop subsumption condition --- .../Transforms/AMDAIEDmaLoopSubsumption.cpp | 45 +++++++-- .../iree-amd-aie/Transforms/Passes.cpp | 7 +- .../test/dma_loop_subsumption_circular.mlir | 92 +++++++++++++++++++ 3 files changed, 136 insertions(+), 8 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp index fc2e872e6..c3cd752dd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp @@ -492,7 +492,7 @@ struct SubsumeLoopIntoDMA if (!isa(parentOp)) return rewriter.notifyMatchFailure(op, "Parent is not a loop-like op"); - auto hasUsersInSameScope = [&](Value result) -> bool { + auto hasOtherUsersInSameScope = [&](Value result) -> bool { for (Operation *userOp : result.getUsers()) { if (userOp != op.getOperation() && parentOp->isProperAncestor(userOp)) { return true; @@ -501,6 +501,25 @@ struct SubsumeLoopIntoDMA return false; }; + auto hasCircularUsersInSameScope = + [&](SmallVector users) -> bool { + bool currentUser = false; + for (AMDAIE::DoublyStridedOpInterface userOp : llvm::reverse(users)) { + // Check if there is other circular dma user in the same scope. + if (isa(userOp) && + userOp != op.getOperation()) { + return true; + } + // Check if there is other user before the current in the same scope. + if (userOp == op.getOperation()) { + currentUser = true; + continue; + } + if (currentUser) return true; + } + return false; + }; + uint8_t sourceMemspaceInt; uint8_t targetMemspaceInt; if (auto npuDmaOp = dyn_cast(op.getOperation())) { @@ -526,7 +545,7 @@ struct SubsumeLoopIntoDMA "merged with other connections, so abort loop subsumption as it " "could potentially lead to deadlocks"); } - if (hasUsersInSameScope(connectionOp.getResult())) { + if (hasOtherUsersInSameScope(connectionOp.getResult())) { return rewriter.notifyMatchFailure( op, "Has users of same DMA in scope, analysis to check validity of " @@ -538,16 +557,28 @@ struct SubsumeLoopIntoDMA sourceMemspaceInt = npuCircularDmaOp.getSourceMemorySpaceAsUInt(); targetMemspaceInt = npuCircularDmaOp.getTargetMemorySpaceAsUInt(); - // Check that the connection this `amdaie.npu.dma_cpy_nd` operation is - // operating on, is not being touched within the same scope. Otherwise, - // the rewrite is not valid in general as it would be changing the - // temporal usage of the source connection. + // Check that the connection this `amdaie.npu.circular_dma_cpy_nd` op is + // operating on, satisfies the following conditions: + // 1) No other users of the connection has the Circular trait in the same + // scope; 2) No other users of the connection before this circular dma op + // in the same scope. Otherwise, the rewrite is not valid in general as it + // would be changing the temporal usage of the source connection. AMDAIE::ConnectionOp connectionOp = npuCircularDmaOp.getConnectionOp(); if (!connectionOp) { return rewriter.notifyMatchFailure( op, "should operate on an `amdaie.connection` op"); } - if (hasUsersInSameScope(connectionOp.getResult())) { + // Walk all dma ops in order and get those which are the users of the + // current connection op. + SmallVector dmaUsers; + parentOp->walk([&](AMDAIE::DoublyStridedOpInterface op) { + auto dmaConnection = dyn_cast_if_present( + op->getOperand(0).getDefiningOp()); + if (dmaConnection && dmaConnection == connectionOp) { + dmaUsers.push_back(op); + } + }); + if (hasCircularUsersInSameScope(dmaUsers)) { return rewriter.notifyMatchFailure( op, "Has users of same DMA in scope, analysis to check validity of " diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index b5044f592..b49c480f2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -545,7 +545,10 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass()); passManager.addPass(memref::createFoldMemRefAliasOpsPass()); passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createAMDAIEConvertToDmaPass()); + AMDAIEConvertToDmaOptions dmaOptions; + dmaOptions.packTransposeOnSource = false; + dmaOptions.unpackTransposeOnSource = true; + passManager.addPass(createAMDAIEConvertToDmaPass(dmaOptions)); passManager.addPass(createAMDAIENormalizeLoopBoundsPass()); passManager.addPass(createAMDAIEInsertCoresPass()); @@ -577,6 +580,8 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCompositionPass()); + passManager.addPass(createAMDAIECanonicalizeDoublyStridedOpPass()); + passManager.addPass(createAMDAIEDmaLoopSubsumptionPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCSEPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir index e23bef98b..9b4833d16 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir @@ -58,3 +58,95 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// Ensure loop subsumption happens for both npu.circular_dma_cpy_nd and npu.dma_cpy_nd ops. +// CHECK-LABEL: @circular_and_other_dma_subsume +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: amdaie.npu.circular_dma_cpy_nd %[[CONNECTION]]([0] [2048] [1], [] [] []) +// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([] [] [], [0, 0, 0, 0] [2, 2, 32, 64] [0, 64, 128, 1]) +#map = affine_map<(d0) -> (d0 * 64)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @circular_and_other_dma_subsume(%arg0: !amdaie.logicalobjectfifo, 2>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 2) { + %1 = affine.apply #map(%arg3) + %2 = amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [32, 64] [128, 1]) + } + amdaie.end + } + } + return + } +} + +// ----- + +// Ensure no subsumption happens, if there is other dma user of the same connection op +// before this npu.circular_dma_cpy_nd op in the same scope. +// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 64)> +// CHECK-LABEL: @other_dma_before_circular_no_subsume +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: amdaie.controlcode +// CHECK: scf.forall (%[[ARG2:.+]], %[[ARG3:.+]]) in (2, 2) +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG3]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([] [] [], [0, %[[APPLY]]] [32, 64] [128, 1]) +// CHECK: amdaie.npu.circular_dma_cpy_nd %[[CONNECTION]]([0] [2048] [1], [] [] []) +#map = affine_map<(d0) -> (d0 * 64)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @other_dma_before_circular_no_subsume(%arg0: !amdaie.logicalobjectfifo, 2>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 2) { + %1 = affine.apply #map(%arg3) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [32, 64] [128, 1]) + %3 = amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) + } + amdaie.end + } + } + return + } +} + +// ----- + +// Ensure no subsumption happens, if there are more than one npu.circular_dma_cpy_nd users +// of the same connection op in the same scope. +// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 64)> +// CHECK-LABEL: @two_circular_dma_no_subsume +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: amdaie.controlcode +// CHECK: scf.forall (%[[ARG2:.+]], %[[ARG3:.+]]) in (2, 2) +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG3]]) +// CHECK: amdaie.npu.circular_dma_cpy_nd %[[CONNECTION]]([0] [2048] [1], [] [] []) +// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([] [] [], [0, %[[APPLY]]] [32, 64] [128, 1]) +// CHECK: amdaie.npu.circular_dma_cpy_nd %[[CONNECTION]]([0] [2048] [1], [] [] []) +#map = affine_map<(d0) -> (d0 * 64)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @two_circular_dma_no_subsume(%arg0: !amdaie.logicalobjectfifo, 2>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 2) { + %1 = affine.apply #map(%arg3) + %2 = amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [32, 64] [128, 1]) + %4 = amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) + } + amdaie.end + } + } + return + } +} \ No newline at end of file From a909e43bb1281f83591dbe97ba985a39bea3d8b6 Mon Sep 17 00:00:00 2001 From: yzhang93 Date: Fri, 4 Oct 2024 13:51:44 -0700 Subject: [PATCH 2/5] Modify test --- .../Transforms/AMDAIEDmaLoopSubsumption.cpp | 6 +++--- .../Transforms/test/dma_loop_subsumption_circular.mlir | 10 ++-------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp index c3cd752dd..742f956d9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp @@ -503,7 +503,7 @@ struct SubsumeLoopIntoDMA auto hasCircularUsersInSameScope = [&](SmallVector users) -> bool { - bool currentUser = false; + bool currentCircularDma = false; for (AMDAIE::DoublyStridedOpInterface userOp : llvm::reverse(users)) { // Check if there is other circular dma user in the same scope. if (isa(userOp) && @@ -512,10 +512,10 @@ struct SubsumeLoopIntoDMA } // Check if there is other user before the current in the same scope. if (userOp == op.getOperation()) { - currentUser = true; + currentCircularDma = true; continue; } - if (currentUser) return true; + if (currentCircularDma) return true; } return false; }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir index 9b4833d16..f17e5e00b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir @@ -122,16 +122,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Ensure no subsumption happens, if there are more than one npu.circular_dma_cpy_nd users // of the same connection op in the same scope. -// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 64)> // CHECK-LABEL: @two_circular_dma_no_subsume // CHECK: %[[CONNECTION:.+]] = amdaie.connection // CHECK: amdaie.controlcode // CHECK: scf.forall (%[[ARG2:.+]], %[[ARG3:.+]]) in (2, 2) -// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG3]]) // CHECK: amdaie.npu.circular_dma_cpy_nd %[[CONNECTION]]([0] [2048] [1], [] [] []) -// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([] [] [], [0, %[[APPLY]]] [32, 64] [128, 1]) // CHECK: amdaie.npu.circular_dma_cpy_nd %[[CONNECTION]]([0] [2048] [1], [] [] []) -#map = affine_map<(d0) -> (d0 * 64)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @two_circular_dma_no_subsume(%arg0: !amdaie.logicalobjectfifo, 2>, %arg1: !amdaie.logicalobjectfifo>) { @@ -139,14 +135,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { scf.forall (%arg2, %arg3) in (2, 2) { - %1 = affine.apply #map(%arg3) %2 = amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) - %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [32, 64] [128, 1]) - %4 = amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) + %3 = amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) } amdaie.end } } return } -} \ No newline at end of file +} From 3a066323c333863917f918ccb55894cdf2c382c1 Mon Sep 17 00:00:00 2001 From: yzhang93 Date: Fri, 4 Oct 2024 22:07:17 -0700 Subject: [PATCH 3/5] Add CanonicalizeDoublyStridedOpPatterns --- .../AMDAIECanonicalizeDoublyStridedOp.cpp | 205 ++++++++++-------- .../Transforms/AMDAIEDmaComposition.cpp | 59 +++-- .../iree-amd-aie/Transforms/Passes.cpp | 2 - .../iree-amd-aie/Transforms/Transforms.h | 4 + 4 files changed, 157 insertions(+), 113 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp index bfe55ea42..6b57403aa 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp @@ -7,7 +7,9 @@ #include "iree-amd-aie/IR/AMDAIEDialect.h" #include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" #include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" #define DEBUG_TYPE "iree-amdaie-canonicalize-doubly-strided-dma" @@ -17,90 +19,105 @@ namespace { /// Recognize linear accesses across multiple DMA access dimensions and fold /// them. -LogicalResult foldDmaOpLinearDims(RewriterBase &rewriter, - AMDAIE::DoublyStridedOpInterface op) { - OpBuilder::InsertionGuard guard(rewriter); - SmallVector sourceOffsets = op.getSourceMixedOffsets(); - SmallVector sourceSizes = op.getSourceMixedSizes(); - SmallVector sourceStrides = op.getSourceMixedStrides(); - SmallVector targetOffsets = op.getTargetMixedOffsets(); - SmallVector targetSizes = op.getTargetMixedSizes(); - SmallVector targetStrides = op.getTargetMixedStrides(); - SmallVector newSourceOffsets, newSourceSizes, newSourceStrides, - newTargetOffsets, newTargetSizes, newTargetStrides; - LogicalResult sourceRes = - foldLinearDims(op.getContext(), sourceOffsets, sourceSizes, sourceStrides, - newSourceOffsets, newSourceSizes, newSourceStrides); - LogicalResult targetRes = - foldLinearDims(op.getContext(), targetOffsets, targetSizes, targetStrides, - newTargetOffsets, newTargetSizes, newTargetStrides); - if (failed(sourceRes) && failed(targetRes)) { - return failure(); +struct FoldDmaOpLinearDims + : public OpInterfaceRewritePattern { + using OpInterfaceRewritePattern::OpInterfaceRewritePattern; + + LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op, + PatternRewriter &rewriter) const override { + OpBuilder::InsertionGuard guard(rewriter); + SmallVector sourceOffsets = op.getSourceMixedOffsets(); + SmallVector sourceSizes = op.getSourceMixedSizes(); + SmallVector sourceStrides = op.getSourceMixedStrides(); + SmallVector targetOffsets = op.getTargetMixedOffsets(); + SmallVector targetSizes = op.getTargetMixedSizes(); + SmallVector targetStrides = op.getTargetMixedStrides(); + SmallVector newSourceOffsets, newSourceSizes, + newSourceStrides, newTargetOffsets, newTargetSizes, newTargetStrides; + LogicalResult sourceRes = foldLinearDims( + op.getContext(), sourceOffsets, sourceSizes, sourceStrides, + newSourceOffsets, newSourceSizes, newSourceStrides); + LogicalResult targetRes = foldLinearDims( + op.getContext(), targetOffsets, targetSizes, targetStrides, + newTargetOffsets, newTargetSizes, newTargetStrides); + if (failed(sourceRes) && failed(targetRes)) { + return failure(); + } + + rewriter.setInsertionPointAfter(op); + auto newDoublyStridedOp = op.createDoublyStridedOp( + rewriter, newTargetOffsets, newTargetSizes, newTargetStrides, + newSourceOffsets, newSourceSizes, newSourceStrides); + rewriter.replaceOp(op, newDoublyStridedOp.getOperation()); + return success(); } - - rewriter.setInsertionPointAfter(op); - auto newDoublyStridedOp = op.createDoublyStridedOp( - rewriter, newTargetOffsets, newTargetSizes, newTargetStrides, - newSourceOffsets, newSourceSizes, newSourceStrides); - rewriter.replaceOp(op, newDoublyStridedOp.getOperation()); - return success(); -} +}; /// Fold single dimension linear accesses and make them implicit. -LogicalResult foldDmaOpSingleDims(RewriterBase &rewriter, - AMDAIE::DoublyStridedOpInterface op) { - OpBuilder::InsertionGuard guard(rewriter); - SmallVector sourceOffsets = op.getSourceMixedOffsets(); - SmallVector sourceSizes = op.getSourceMixedSizes(); - SmallVector sourceStrides = op.getSourceMixedStrides(); - SmallVector targetOffsets = op.getTargetMixedOffsets(); - SmallVector targetSizes = op.getTargetMixedSizes(); - SmallVector targetStrides = op.getTargetMixedStrides(); - LogicalResult sourceRes = - foldSingleDim(sourceOffsets, sourceSizes, sourceStrides); - LogicalResult targetRes = - foldSingleDim(targetOffsets, targetSizes, targetStrides); - if (failed(sourceRes) && failed(targetRes)) { - return failure(); +struct FoldDmaOpSingleDims + : public OpInterfaceRewritePattern { + using OpInterfaceRewritePattern::OpInterfaceRewritePattern; + + LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op, + PatternRewriter &rewriter) const override { + OpBuilder::InsertionGuard guard(rewriter); + SmallVector sourceOffsets = op.getSourceMixedOffsets(); + SmallVector sourceSizes = op.getSourceMixedSizes(); + SmallVector sourceStrides = op.getSourceMixedStrides(); + SmallVector targetOffsets = op.getTargetMixedOffsets(); + SmallVector targetSizes = op.getTargetMixedSizes(); + SmallVector targetStrides = op.getTargetMixedStrides(); + LogicalResult sourceRes = + foldSingleDim(sourceOffsets, sourceSizes, sourceStrides); + LogicalResult targetRes = + foldSingleDim(targetOffsets, targetSizes, targetStrides); + if (failed(sourceRes) && failed(targetRes)) { + return failure(); + } + + rewriter.setInsertionPointAfter(op); + auto newDoublyStridedOp = op.createDoublyStridedOp( + rewriter, targetOffsets, targetSizes, targetStrides, sourceOffsets, + sourceSizes, sourceStrides); + rewriter.replaceOp(op, newDoublyStridedOp.getOperation()); + return success(); } - - rewriter.setInsertionPointAfter(op); - auto newDoublyStridedOp = op.createDoublyStridedOp( - rewriter, targetOffsets, targetSizes, targetStrides, sourceOffsets, - sourceSizes, sourceStrides); - rewriter.replaceOp(op, newDoublyStridedOp.getOperation()); - return success(); -} +}; /// Fold unit dimensions within a strided access pattern. -LogicalResult foldDmaOpUnitDims(RewriterBase &rewriter, - AMDAIE::DoublyStridedOpInterface op) { - OpBuilder::InsertionGuard guard(rewriter); - SmallVector sourceOffsets = op.getSourceMixedOffsets(); - SmallVector sourceSizes = op.getSourceMixedSizes(); - SmallVector sourceStrides = op.getSourceMixedStrides(); - SmallVector targetOffsets = op.getTargetMixedOffsets(); - SmallVector targetSizes = op.getTargetMixedSizes(); - SmallVector targetStrides = op.getTargetMixedStrides(); - SmallVector newSourceOffsets, newSourceSizes, newSourceStrides, - newTargetOffsets, newTargetSizes, newTargetStrides; - LogicalResult sourceRes = - foldUnitDims(sourceOffsets, sourceSizes, sourceStrides, newSourceOffsets, - newSourceSizes, newSourceStrides); - LogicalResult targetRes = - foldUnitDims(targetOffsets, targetSizes, targetStrides, newTargetOffsets, - newTargetSizes, newTargetStrides); - if (failed(sourceRes) && failed(targetRes)) { - return failure(); +struct FoldDmaOpUnitDims + : public OpInterfaceRewritePattern { + using OpInterfaceRewritePattern::OpInterfaceRewritePattern; + + LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op, + PatternRewriter &rewriter) const override { + OpBuilder::InsertionGuard guard(rewriter); + SmallVector sourceOffsets = op.getSourceMixedOffsets(); + SmallVector sourceSizes = op.getSourceMixedSizes(); + SmallVector sourceStrides = op.getSourceMixedStrides(); + SmallVector targetOffsets = op.getTargetMixedOffsets(); + SmallVector targetSizes = op.getTargetMixedSizes(); + SmallVector targetStrides = op.getTargetMixedStrides(); + SmallVector newSourceOffsets, newSourceSizes, + newSourceStrides, newTargetOffsets, newTargetSizes, newTargetStrides; + LogicalResult sourceRes = + foldUnitDims(sourceOffsets, sourceSizes, sourceStrides, + newSourceOffsets, newSourceSizes, newSourceStrides); + LogicalResult targetRes = + foldUnitDims(targetOffsets, targetSizes, targetStrides, + newTargetOffsets, newTargetSizes, newTargetStrides); + if (failed(sourceRes) && failed(targetRes)) { + return failure(); + } + + rewriter.setInsertionPointAfter(op); + auto newDoublyStridedOp = op.createDoublyStridedOp( + rewriter, newTargetOffsets, newTargetSizes, newTargetStrides, + newSourceOffsets, newSourceSizes, newSourceStrides); + rewriter.replaceOp(op, newDoublyStridedOp.getOperation()); + return success(); } - - rewriter.setInsertionPointAfter(op); - auto newDoublyStridedOp = op.createDoublyStridedOp( - rewriter, newTargetOffsets, newTargetSizes, newTargetStrides, - newSourceOffsets, newSourceSizes, newSourceStrides); - rewriter.replaceOp(op, newDoublyStridedOp.getOperation()); - return success(); -} +}; class AMDAIECanonicalizeDoublyStridedOpPass : public impl::AMDAIECanonicalizeDoublyStridedOpBase< @@ -121,30 +138,28 @@ class AMDAIECanonicalizeDoublyStridedOpPass void AMDAIECanonicalizeDoublyStridedOpPass::runOnOperation() { Operation *parentOp = getOperation(); - IRRewriter rewriter(parentOp->getContext()); - - // Fold DMA unit dimensions. Needs to happen before folding linear dimensions - // to avoid blocking detection of linear dimension folding opportunities due - // to a unit dimension in between. - parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) { - (void)foldDmaOpUnitDims(rewriter, dmaOp); - }); + MLIRContext *context = &getContext(); + RewritePatternSet patterns(context); + + populateCanonicalizeDoublyStridedOpPatterns(patterns, foldSingleDims); + if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { + parentOp->emitOpError( + "failed to canonicalize doubly strided DMA operations"); + return signalPassFailure(); + } +} - // Fold linear dimensions within a DMA op. - parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) { - (void)foldDmaOpLinearDims(rewriter, dmaOp); - }); +} // namespace - // Make DMA accesses with single dimension implicit. +void populateCanonicalizeDoublyStridedOpPatterns(RewritePatternSet &patterns, + bool foldSingleDims) { + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); if (foldSingleDims) { - parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) { - (void)foldDmaOpSingleDims(rewriter, dmaOp); - }); + patterns.add(patterns.getContext()); } } -} // namespace - std::unique_ptr createAMDAIECanonicalizeDoublyStridedOpPass( AMDAIECanonicalizeDoublyStridedOpOptions options) { return std::make_unique(options); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp index 3991e4731..41c01a096 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp @@ -38,27 +38,54 @@ class AMDAIEDmaCompositionPass void AMDAIEDmaCompositionPass::runOnOperation() { Operation *parentOp = getOperation(); MLIRContext *context = &getContext(); - RewritePatternSet patterns(context); + + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required to determine when loops " + "can be subsumed into DMA operations, and must be attached to a " + "containing ModuleOp."; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + + { + RewritePatternSet patterns(context); + populateDmaLoopSubsumptionPattern(patterns, std::move(deviceModel), + onlyZeroStrideOnOuterDim); + if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { + parentOp->emitOpError("failed to subsume loops into DMA operations"); + return signalPassFailure(); + } + } { - auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); - std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); - if (!maybeDevice) { - parentOp->emitOpError() - << "has no AMDAIEDevice in the target attribute configuration. This " - "device-specific information is required to determine when loops " - "can be subsumed into DMA operations, and must be attached to a " - "containing ModuleOp."; + RewritePatternSet patterns(context); + populateStridedOpCombinationPattern(patterns); + if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { + parentOp->emitOpError("failed to compose strided operations"); return signalPassFailure(); } - AMDAIE::AMDAIEDeviceModel deviceModel = - AMDAIE::getDeviceModel(maybeDevice.value()); + } + { + RewritePatternSet patterns(context); + populateCanonicalizeDoublyStridedOpPatterns(patterns, false); + if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { + parentOp->emitOpError( + "failed to canonicalize doubly strided DMA operations"); + return signalPassFailure(); + } + } + { + RewritePatternSet patterns(context); populateDmaLoopSubsumptionPattern(patterns, std::move(deviceModel), onlyZeroStrideOnOuterDim); - } - populateStridedOpCombinationPattern(patterns); - if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { - parentOp->emitOpError("failed to compose strided operations"); - return signalPassFailure(); + if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { + parentOp->emitOpError("failed to compose strided operations"); + return signalPassFailure(); + } } IRRewriter rewriter(parentOp->getContext()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index b49c480f2..0703d9fea 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -580,8 +580,6 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCompositionPass()); - passManager.addPass(createAMDAIECanonicalizeDoublyStridedOpPass()); - passManager.addPass(createAMDAIEDmaLoopSubsumptionPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCSEPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h index 0b805d8d6..af4cc394e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h @@ -39,6 +39,10 @@ LogicalResult normalizeLoopBounds(RewriterBase &rewriter, scf::ForOp forOp); LogicalResult normalizeLoopBounds(RewriterBase &rewriter, scf::ForallOp forallOp); +/// Populate patterns that canonicalize doubly strided DMA operations. +void populateCanonicalizeDoublyStridedOpPatterns(RewritePatternSet &patterns, + bool foldSingleDims); + /// Populate patterns that subsume loops iterations into DMA access patterns. void populateDmaLoopSubsumptionPattern(RewritePatternSet &patterns, AMDAIE::AMDAIEDeviceModel &&deviceModel, From cafd44cda2db0aa00fcf6a72cc35075164e66f55 Mon Sep 17 00:00:00 2001 From: yzhang93 Date: Wed, 9 Oct 2024 15:23:29 -0700 Subject: [PATCH 4/5] Rebase on main --- .../Transforms/AMDAIEDmaComposition.cpp | 60 ++++++------------- .../test/dma_loop_subsumption_circular.mlir | 8 +-- 2 files changed, 21 insertions(+), 47 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp index 41c01a096..9939ae42f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp @@ -38,54 +38,28 @@ class AMDAIEDmaCompositionPass void AMDAIEDmaCompositionPass::runOnOperation() { Operation *parentOp = getOperation(); MLIRContext *context = &getContext(); - - auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); - std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); - if (!maybeDevice) { - parentOp->emitOpError() - << "has no AMDAIEDevice in the target attribute configuration. This " - "device-specific information is required to determine when loops " - "can be subsumed into DMA operations, and must be attached to a " - "containing ModuleOp."; - return signalPassFailure(); - } - AMDAIE::AMDAIEDeviceModel deviceModel = - AMDAIE::getDeviceModel(maybeDevice.value()); - - { - RewritePatternSet patterns(context); - populateDmaLoopSubsumptionPattern(patterns, std::move(deviceModel), - onlyZeroStrideOnOuterDim); - if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { - parentOp->emitOpError("failed to subsume loops into DMA operations"); - return signalPassFailure(); - } - } + RewritePatternSet patterns(context); { - RewritePatternSet patterns(context); - populateStridedOpCombinationPattern(patterns); - if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { - parentOp->emitOpError("failed to compose strided operations"); + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required to determine when loops " + "can be subsumed into DMA operations, and must be attached to a " + "containing ModuleOp."; return signalPassFailure(); } - } - { - RewritePatternSet patterns(context); - populateCanonicalizeDoublyStridedOpPatterns(patterns, false); - if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { - parentOp->emitOpError( - "failed to canonicalize doubly strided DMA operations"); - return signalPassFailure(); - } - } - { - RewritePatternSet patterns(context); + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); populateDmaLoopSubsumptionPattern(patterns, std::move(deviceModel), onlyZeroStrideOnOuterDim); - if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { - parentOp->emitOpError("failed to compose strided operations"); - return signalPassFailure(); - } + } + populateStridedOpCombinationPattern(patterns); + populateCanonicalizeDoublyStridedOpPatterns(patterns, false); + if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { + parentOp->emitOpError("failed to compose strided operations"); + return signalPassFailure(); } IRRewriter rewriter(parentOp->getContext()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir index f17e5e00b..dd0b695c1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir @@ -77,8 +77,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { scf.forall (%arg2, %arg3) in (2, 2) { %1 = affine.apply #map(%arg3) - %2 = amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) - %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [32, 64] [128, 1]) + amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) + amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [32, 64] [128, 1]) } amdaie.end } @@ -108,8 +108,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { scf.forall (%arg2, %arg3) in (2, 2) { %1 = affine.apply #map(%arg3) - %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [32, 64] [128, 1]) - %3 = amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) + amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [32, 64] [128, 1]) + amdaie.npu.circular_dma_cpy_nd %0([0] [2048] [1], [] [] []) } amdaie.end } From 099513dc0e43e0e1c6daedfe78afd024bee287a1 Mon Sep 17 00:00:00 2001 From: yzhang93 Date: Wed, 9 Oct 2024 16:04:33 -0700 Subject: [PATCH 5/5] Fix conv failure --- .../AMD-AIE/iree-amd-aie/Transforms/Passes.cpp | 15 ++++++++++++--- .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 3 ++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 0703d9fea..154ca126d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -521,7 +521,8 @@ void buildAMDAIETransformPassPipeline( } modulePassManager.addPass(createLowerUKernelOpsToCallsPass()); if (useLowerToAIEPipeline == LowerToAIEPassPipeline::ObjectFifo) { - addAMDAIEObjectFifoLoweringPasses(modulePassManager, enablePacketFlow); + addAMDAIEObjectFifoLoweringPasses(modulePassManager, enablePacketFlow, + useTilePipeline); } else if (useLowerToAIEPipeline == LowerToAIEPassPipeline::AIR) { addMLIRAIRLoweringPasses(modulePassManager, device, useTilePipeline, matmulElementwiseFusion); @@ -541,12 +542,20 @@ void buildAMDAIETransformPassPipeline( } void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, - bool enablePacketFlow) { + bool enablePacketFlow, + TilePassPipeline useTilePipeline) { passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass()); passManager.addPass(memref::createFoldMemRefAliasOpsPass()); passManager.addPass(createCanonicalizerPass()); + // For matmul pipelines, we do transpose on target side for pack ops to get + // better performance. While for convolution pipelines, the same settings + // cause 'aie.dma_bd' error, so for now keep using transpose on source for + // both pack and unpack ops. + // TODO(vivian): explore the other options for conv ops. AMDAIEConvertToDmaOptions dmaOptions; - dmaOptions.packTransposeOnSource = false; + dmaOptions.packTransposeOnSource = + (useTilePipeline == TilePassPipeline::ConvDecomposePipeline) ? true + : false; dmaOptions.unpackTransposeOnSource = true; passManager.addPass(createAMDAIEConvertToDmaPass(dmaOptions)); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 6ced4ff21..4f274be8e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -16,7 +16,8 @@ namespace mlir::iree_compiler::AMDAIE { /// Add passes to lower to AIE objectFifos. void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, - bool enablePacketFlow); + bool enablePacketFlow, + TilePassPipeline useTilePipeline); /// Add passes to lower from MLIR-AIR through AIE. This is /// currently the default passes used for lowering after IREEs tiling.