From 0690d7c1110a399bc7e7f25054e3a9780c6f1052 Mon Sep 17 00:00:00 2001 From: James Newling Date: Fri, 11 Oct 2024 09:15:34 -0700 Subject: [PATCH 1/3] squash post addressing Jorn's review comments --- .../samples/matmul_pack_peel_objectfifo.mlir | 4 +- .../AMDAIEDistributeCoresAndObjectFifos.cpp | 265 +----------------- .../AMDAIEDistributeL1Allocations.cpp | 213 ++++++++++++++ .../iree-amd-aie/Transforms/AMDAIEDmaUtils.h | 68 ----- .../iree-amd-aie/Transforms/CMakeLists.txt | 1 + .../iree-amd-aie/Transforms/PassDetail.h | 1 + .../iree-amd-aie/Transforms/Passes.cpp | 5 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 3 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 21 ++ .../Transforms/test/CMakeLists.txt | 1 + .../distribute_cores_and_objectfifos.mlir | 89 +----- .../test/distribute_l1_allocations.mlir | 93 ++++++ 12 files changed, 351 insertions(+), 413 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index 59da53759..e902c7e2a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ -// This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. +// This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. 
This test is included, as it can be useful to have a reference in IR of all the passes that are run. -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 
max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index 650de7624..3a8b28f28 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -5,10 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEOps.h" -#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/Transforms/Transforms.h" -#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" @@ -17,7 +15,6 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/Iterators.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/Verifier.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -61,258 +58,6 @@ struct LocationMapInfo { // AMDAIEDistributeCoresAndObjectFifosPass //===----------------------------------------------------------------------===// -/// Try to detect subview(s) that look like they are 'distributing' aka -/// 'privatizing'. That is, subview(s) that take an L1 memref spanning all -/// L1 memories of the AIE array, and slice it along tile specific dimensions. -/// If one or more identical subviews are found, return the MemRefType of -/// the subview(s). Otherwise, return an empty MemRefType. -MemRefType getDistributedType(memref::AllocOp alloc) { - MemRefType type{}; - for (Operation *allocUser : alloc->getUsers()) { - if (auto subview = dyn_cast(allocUser)) { - Operation::operand_range offsets = subview.getOffsets(); - - // This subview op is contained inside nested scf.for ops. We count how - // how many of these loop ops are annotated with amdaie.unroll, and are - // sliced on their induction variable. For distributed L2 memory, we - // expect this to be exactly 2, and we expect no slicing in other - // dimensions. It is possible to handle other edge cases, but this is left - // for future work. 
- uint32_t nbNonConstants = - std::count_if(offsets.begin(), offsets.end(), [](Value v) -> bool { - return !mlir::matchPattern(v, mlir::m_Constant()); - }); - if (nbNonConstants != 2) return {}; - uint32_t nbDistributionLoops{0}; - scf::ForOp currentOp = subview->getParentOfType(); - while (currentOp) { - Value iv = currentOp.getInductionVar(); - uint64_t sliceCount = std::count(offsets.begin(), offsets.end(), iv); - if (sliceCount > 1) return {}; - if (sliceCount == 1) { - if (!currentOp->hasAttr(kAMDAIELoopUnroll)) return {}; - ++nbDistributionLoops; - } - currentOp = currentOp->getParentOfType(); - } - if (nbDistributionLoops != 2) return {}; - auto nextType = cast(subview.getResult().getType()); - if (!type) { - type = nextType; - } else if (type != nextType) { - // This is the case where there are 2+ subview ops which look like - // they should be distributing, but they have different result types. - // Bail. - return {}; - } - } - } - return type; -} - -/// Distribute local memory accesses through subviews by allocating a single -/// smaller memory. This is needed because cores can't operate on one larger L1 -/// memory. -LogicalResult distributeLocalMemory(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - SmallVector toBeErased; - - moduleOp->walk([&](memref::AllocOp oldAlloc) { - Attribute maybeMemorySpace = - cast(oldAlloc.getResult().getType()).getMemorySpace(); - if (!maybeMemorySpace) return WalkResult::advance(); - auto memorySpace = cast(maybeMemorySpace); - - // Only consider local memory (L1). - if (memorySpace.getInt() != 2) return WalkResult::advance(); - - // Don't try and distribute memory if the alloc is inside a scf.for op. - if (auto scfForOp = oldAlloc->getParentOfType()) - return WalkResult::advance(); - - MemRefType memRefType = getDistributedType(oldAlloc); - - // Failed to find a memref.subview that looks like it is distributing. 
- // This doesn't mean that we can't distribute (for example there might be - // no subviews at all), but this requires further work. - if (!memRefType) return WalkResult::advance(); - - ArrayRef newShape = memRefType.getShape(); - Type elementType = memRefType.getElementType(); - - rewriter.setInsertionPoint(oldAlloc); - MemRefType newAllocType = MemRefType::get( - newShape, elementType, MemRefLayoutAttrInterface{}, memorySpace); - auto newAlloc = rewriter.create(rewriter.getUnknownLoc(), - newAllocType); - auto newDeallocOp = - rewriter.create(rewriter.getUnknownLoc(), newAlloc); - - newDeallocOp->moveBefore(&newAlloc->getBlock()->back()); - - // Replace uses of the old alloc with the new alloc. - for (Operation *userOp : oldAlloc->getUsers()) { - LogicalResult switchResult = - llvm::TypeSwitch(userOp) - .Case([&](memref::SubViewOp subviewOp) { - rewriter.replaceAllUsesWith(subviewOp, newAlloc); - toBeErased.push_back(subviewOp); - return success(); - }) - .Case( - [&](vector::TransferReadOp transferReadOp) { - rewriter.setInsertionPoint(transferReadOp); - // Since in this function we're basically changing the L1 - // sizes of the Alloc, for dimensions with size as 1 we need - // to set the indices as 0. We need to do this at this step - // because there would be loop dependencies on the same and - // when we unroll those loops later in this pass we would - // have incorrect offset values being formed for those - // dimensions. 
- SmallVector newIndices = transferReadOp.getIndices(); - Value c0 = rewriter.create( - transferReadOp.getLoc(), 0); - for (unsigned i = 0, n = newShape.size(); i < n; i++) { - if (newShape[i] == 1) newIndices[i] = c0; - } - - auto newTransferReadOp = - rewriter.create( - transferReadOp.getLoc(), transferReadOp.getType(), - newAlloc, newIndices, - transferReadOp.getPermutationMapAttr(), - transferReadOp.getPadding(), - transferReadOp.getMask(), - transferReadOp.getInBoundsAttr()); - rewriter.replaceAllUsesWith(transferReadOp, - newTransferReadOp.getResult()); - toBeErased.push_back(transferReadOp); - return success(); - }) - .Case( - [&](vector::TransferWriteOp transferWriteOp) { - rewriter.setInsertionPoint(transferWriteOp); - // Since in this function we're basically changing the L1 - // sizes of the Alloc, for dimensions with size as 1 we need - // to set the indices as 0. We need to do this at this step - // because there would be loop dependencies on the same and - // when we unroll those loops later in this pass we would - // have incorrect offset values being formed for those - // dimensions. 
- SmallVector newIndices = - transferWriteOp.getIndices(); - Value c0 = rewriter.create( - transferWriteOp.getLoc(), 0); - - for (unsigned i = 0, n = newShape.size(); i < n; i++) { - if (newShape[i] == 1) newIndices[i] = c0; - } - - rewriter.create( - transferWriteOp.getLoc(), transferWriteOp.getVector(), - newAlloc, newIndices, - transferWriteOp.getPermutationMapAttr(), - transferWriteOp.getMask(), - transferWriteOp.getInBoundsAttr()); - toBeErased.push_back(transferWriteOp); - return success(); - }) - .Case( - [&](memref::ExtractStridedMetadataOp - extractStridedMetadataOp) { - rewriter.setInsertionPoint(extractStridedMetadataOp); - auto newextractStridedMetadataOp = - rewriter.create( - extractStridedMetadataOp.getLoc(), newAlloc); - rewriter.replaceAllUsesWith( - extractStridedMetadataOp.getResults(), - newextractStridedMetadataOp.getResults()); - toBeErased.push_back(extractStridedMetadataOp); - return success(); - }) - .Case([&](memref::DeallocOp deallocOp) { - toBeErased.push_back(userOp); - return success(); - }) - .Case( - [&rewriter, &newAlloc, &toBeErased]( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - auto type = llvm::cast(newAlloc.getType()); - - // Collect all DmaCpyNdOps which have 'logicalObjectFifo' as - // a source. Currently not handling the case of multiple. 
- SmallVector dmaOps; - for (Operation *objFifoUserOp : - logicalObjectFifo->getUsers()) { - if (auto dmaOp = - dyn_cast(objFifoUserOp); - dmaOp.getSourceObjectFifo() == logicalObjectFifo) { - dmaOps.push_back(dmaOp); - } - } - if (dmaOps.size() == 0) return success(); - if (dmaOps.size() > 1) { - logicalObjectFifo->emitOpError( - "Case of multiple DMA ops not handled yet (easy " - "extension to logic here)"); - return failure(); - } - AMDAIE::DmaCpyNdOp dmaOp = dmaOps[0]; - - SmallVector empty; - rewriter.setInsertionPoint(logicalObjectFifo); - auto source = - rewriter.create( - rewriter.getUnknownLoc(), - LogicalObjectFifoType::get(type), - newAlloc.getResult()); - rewriter.replaceAllUsesWith(logicalObjectFifo, source); - toBeErased.push_back(logicalObjectFifo); - rewriter.setInsertionPoint(dmaOp); - auto newDmaOp = rewriter.create( - dmaOp.getLoc(), dmaOp.getTarget(), - dmaOp.getTargetMixedOffsets(), - dmaOp.getTargetMixedSizes(), - dmaOp.getTargetMixedStrides(), source, - dmaOp.getSourceMixedOffsets(), - dmaOp.getSourceMixedSizes(), - dmaOp.getSourceMixedStrides()); - rewriter.replaceAllUsesWith(dmaOp, newDmaOp); - // TODO: maybe this should be left to a DCE somewhere, - // instead of manually erasing unused ops? - toBeErased.push_back(dmaOp); - // We have to discard non-zero offsets as subview has - // been replaced by a dedicated allocated memref. 
- SmallVector allocShape(type.getShape()); - (void)discardAllNonZeroOffsets( - rewriter, - cast( - newDmaOp.getOperation()), - allocShape); - return success(); - }) - .Default([&](Operation *userOp) { - userOp->emitOpError( - "needs to have logic implemented for handling in " - "distributeLocalMemory"); - return failure(); - }); - - if (failed(switchResult)) return WalkResult::interrupt(); - } - toBeErased.push_back(oldAlloc); - - return WalkResult::advance(); - }); - - for (Operation *op : toBeErased) { - op->dropAllUses(); - rewriter.eraseOp(op); - } - - return success(); -} - /// Convert inner scf.forall ops chosen for parallel distribution to scf.for /// ops. LogicalResult localForallToFor(ModuleOp moduleOp) { @@ -649,8 +394,7 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { } for (auto &&[idx, operand] : llvm::enumerate(op->getOpOperands())) { Operation *operandDefiningOp = operand.get().getDefiningOp(); - if (!dyn_cast_if_present(operandDefiningOp)) - continue; + if (!dyn_cast_if_present(operandDefiningOp)) continue; if (memrefToLogicalObjectFifoAccess.contains(operand.get())) { op->setOperand(idx, memrefToLogicalObjectFifoAccess[operand.get()]); } else if (memrefToLogicalObjectFifo.contains(operand.get())) { @@ -956,13 +700,6 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { LLVM_DEBUG(llvm::dbgs() << "Module after localForallToFor: \n" << moduleOp << "\n"); - if (failed(distributeLocalMemory(moduleOp))) { - moduleOp.emitOpError() << "local memory distribution failed"; - return signalPassFailure(); - } - LLVM_DEBUG(llvm::dbgs() << "Module after distributeLocalMemory: \n" - << moduleOp << "\n"); - if (failed(verify(moduleOp, true))) { return signalPassFailure(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp new file mode 100644 index 000000000..63e797356 --- /dev/null +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp @@ -0,0 +1,213 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEDialect.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "llvm/ADT/TypeSwitch.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h" + +#define DEBUG_TYPE "iree-amdaie-distribute-l1-allocations" + +namespace mlir::iree_compiler::AMDAIE { + +using namespace mlir; + +namespace { + +/// Find all induction variables of all `scf.forall` ops that are mapped to +/// gpu thread dimensions (as opposed to gpu block dimensions etc). +FailureOr> getThreadIndVars(ModuleOp moduleOp) { + DenseSet threadIndVars; + moduleOp.walk([&](scf::ForallOp forallOp) { + std::optional maybeMapping = forallOp.getMapping(); + if (!maybeMapping) return WalkResult::advance(); + SmallVector mapping = llvm::to_vector(maybeMapping->getValue()); + if (mapping.empty()) return WalkResult::advance(); + if (!isa(*mapping.begin())) + return WalkResult::advance(); + for (Value indVar : forallOp.getInductionVars()) { + threadIndVars.insert(indVar); + } + return WalkResult::advance(); + }); + return threadIndVars; +} + +/// Try to detect subview(s) that look like they're 'distributing' L1 memory. +/// That is: they slice the L1 memory along thread/tile dimensions. 
+MemRefType getDistributedType(memref::AllocOp alloc, + const DenseSet &indVars) { + MemRefType type; + for (Operation *allocUser : alloc->getUsers()) { + if (auto subview = dyn_cast(allocUser)) { + // Check that all offsets are either constants or thread ids. We assume + // that if a subview has an offset which is not a constant and not a + // thread id, it's not 'distributing'. + Operation::operand_range offsets = subview.getOffsets(); + for (Value offset : offsets) { + bool isConst = matchPattern(offset, m_Constant()); + bool isIndVar = llvm::is_contained(indVars, offset); + if (!isConst && !isIndVar) return {}; + } + + auto nextType = cast(subview.getResult().getType()); + if (!type) { + type = nextType; + } else if (type != nextType) { + // This is the case where there are 2+ subview ops which look like + // they should be distributing, but they have different result types. + // Bail. + return {}; + } + } + } + return type; +} + +/// Create a copy of `toUpdate` with all values in `toRemove` replaced by +/// `replacement`. +template +SmallVector substitute(Container toUpdate, + const DenseSet &toRemove, + Value replacement) { + SmallVector updated(toUpdate.begin(), toUpdate.end()); + for (Value &v : updated) { + if (toRemove.contains(v)) v = replacement; + } + return updated; +} + +/// Distribute local memory accesses through subviews by allocating a single, +/// smaller memory. This is ultimately needed because cores can't operate on +/// one shared L1 memory. +LogicalResult distributeLocalMemory(ModuleOp moduleOp) { + FailureOr> maybeIndVars = getThreadIndVars(moduleOp); + if (failed(maybeIndVars)) return failure(); + const DenseSet &indVars = maybeIndVars.value(); + IRRewriter rewriter(moduleOp.getContext()); + moduleOp->walk([&](memref::AllocOp oldAlloc) { + // Only consider local memory (L1). 
+ Attribute maybeMemorySpace = oldAlloc.getType().getMemorySpace(); + if (!maybeMemorySpace) return WalkResult::advance(); + auto memorySpace = cast(maybeMemorySpace); + if (memorySpace.getInt() != 2) return WalkResult::advance(); + + // Don't try and distribute memory if the alloc is inside a scf.for op. + if (auto scfForOp = oldAlloc->getParentOfType()) + return WalkResult::advance(); + + MemRefType memRefType = getDistributedType(oldAlloc, indVars); + + // Failed to find a memref.subview that looks like it is distributing. + // This doesn't mean that we can't distribute (for example there might be + // no subviews at all), but this requires further work. + if (!memRefType) return WalkResult::advance(); + + ArrayRef newShape = memRefType.getShape(); + Type elementType = memRefType.getElementType(); + + rewriter.setInsertionPoint(oldAlloc); + MemRefType newAllocType = MemRefType::get( + newShape, elementType, MemRefLayoutAttrInterface{}, memorySpace); + auto newAlloc = rewriter.create(rewriter.getUnknownLoc(), + newAllocType); + + const SmallVector users(oldAlloc->user_begin(), + oldAlloc->user_end()); + + // Replace uses of the old alloc with the new alloc. 
+ for (Operation *user : users) { + LogicalResult switchResult = + llvm::TypeSwitch(user) + .Case([&](memref::SubViewOp subviewOp) { + rewriter.replaceAllUsesWith(subviewOp, newAlloc); + return success(); + }) + .Case([&](vector::TransferReadOp readOp) { + rewriter.setInsertionPoint(readOp); + Value c0 = + rewriter.create(readOp.getLoc(), 0); + SmallVector indices = + substitute(readOp.getIndices(), indVars, c0); + rewriter.replaceOpWithNewOp( + readOp, readOp.getType(), newAlloc, indices, + readOp.getPermutationMapAttr(), readOp.getPadding(), + readOp.getMask(), readOp.getInBoundsAttr()); + return success(); + }) + .Case( + [&](vector::TransferWriteOp writeOp) { + rewriter.setInsertionPoint(writeOp); + Value c0 = rewriter.create( + writeOp.getLoc(), 0); + SmallVector indices = + substitute(writeOp.getIndices(), indVars, c0); + rewriter.replaceOpWithNewOp( + writeOp, writeOp.getVector(), newAlloc, indices, + writeOp.getPermutationMapAttr(), writeOp.getMask(), + writeOp.getInBoundsAttr()); + return success(); + }) + .Case( + [&](memref::ExtractStridedMetadataOp + extractStridedMetadataOp) { + rewriter + .replaceOpWithNewOp( + extractStridedMetadataOp, newAlloc); + return success(); + }) + .Case([&](memref::DeallocOp deallocOp) { + rewriter.setInsertionPoint(deallocOp); + rewriter.create(rewriter.getUnknownLoc(), + newAlloc); + return success(); + }) + .Default([&](Operation *user) { + user->emitOpError("needs logic implemented for handling."); + return failure(); + }); + + if (failed(switchResult)) return WalkResult::interrupt(); + } + + return WalkResult::advance(); + }); + + return success(); +} + +class AMDAIEDistributeL1AllocationsPass + : public impl::AMDAIEDistributeL1AllocationsBase< + AMDAIEDistributeL1AllocationsPass> { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + AMDAIEDistributeL1AllocationsPass() = default; + AMDAIEDistributeL1AllocationsPass( + const AMDAIEDistributeL1AllocationsPass 
&pass){}; + void runOnOperation() override; +}; + +void AMDAIEDistributeL1AllocationsPass::runOnOperation() { + ModuleOp moduleOp = getOperation(); + if (failed(distributeLocalMemory(moduleOp))) return signalPassFailure(); +} +} // namespace + +std::unique_ptr createAMDAIEDistributeL1AllocationsPass() { + return std::make_unique(); +} +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h index dc9f88c23..567c46bea 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h @@ -154,74 +154,6 @@ LogicalResult foldUnitDims(const SmallVector &offsets, SmallVector &newStrides, SmallVector &newSizes); -/// Utility to discard all non-zero offsets that have dimension equal to 1 on -/// the same index of the provided shape. This helps with updating DMA -/// operations for a shape change. If an empty shape is passed, all non-zero -/// offsets will be removed. -template -AMDAIE::DoublyStridedOpInterface discardAllNonZeroOffsets( - RewriterBase &rewriter, AMDAIE::DoublyStridedOpInterface op, - SmallVector &shape) { - SmallVector newSourceOffsets; - SmallVector newSourceSizes; - SmallVector newSourceStrides; - SmallVector newTargetOffsets; - SmallVector newTargetSizes; - SmallVector newTargetStrides; - if constexpr (OperateOn == CopyOpOperateOn::Source) { - SmallVector offsets = op.getSourceMixedOffsets(); - SmallVector sizes = op.getSourceMixedSizes(); - SmallVector strides = op.getSourceMixedStrides(); - // Set shape to a vector of ones as a default. - if (shape.empty()) { - SmallVector ones(offsets.size(), 1); - shape = ones; - } - if (shape.size() != offsets.size()) return op; - // Fill source offsets/sizes/strides. 
- for (auto &&[offset, size, stride, dim] : - llvm::zip(offsets, sizes, strides, shape)) { - std::optional constantOffset = getConstantIntValue(offset); - if (dim == 1 && !constantOffset) continue; - if (dim == 1 && constantOffset && constantOffset.value() != 0) continue; - newSourceOffsets.push_back(offset); - newSourceSizes.push_back(size); - newSourceStrides.push_back(stride); - } - newTargetOffsets = op.getTargetMixedOffsets(); - newTargetSizes = op.getTargetMixedSizes(); - newTargetStrides = op.getTargetMixedStrides(); - } else if constexpr (OperateOn == CopyOpOperateOn::Target) { - SmallVector offsets = op.getTargetMixedOffsets(); - SmallVector sizes = op.getTargetMixedSizes(); - SmallVector strides = op.getTargetMixedStrides(); - // Set shape to a vector of ones as a default. - if (shape.empty()) { - SmallVector ones(offsets.size(), 1); - shape = ones; - } - if (shape.size() != offsets.size()) return op; - // Fill source offsets/sizes/strides. - for (auto &&[offset, size, stride, dim] : - llvm::zip(offsets, sizes, strides, shape)) { - std::optional constantOffset = getConstantIntValue(offset); - if (dim == 1 && !constantOffset) continue; - if (dim == 1 && constantOffset && constantOffset.value() != 0) continue; - newTargetOffsets.push_back(offset); - newTargetSizes.push_back(size); - newTargetStrides.push_back(stride); - } - newSourceOffsets = op.getSourceMixedOffsets(); - newSourceSizes = op.getSourceMixedSizes(); - newSourceStrides = op.getSourceMixedStrides(); - } - rewriter.setInsertionPointAfter(op); - auto newDoublyStridedOp = op.createDoublyStridedOp( - rewriter, newTargetOffsets, newTargetSizes, newTargetStrides, - newSourceOffsets, newSourceSizes, newSourceStrides); - rewriter.replaceOp(op, newDoublyStridedOp.getOperation()); - return newDoublyStridedOp; -} /// Utility DMA configuration which is calculated based on AMDAIEDeviceModel /// information. 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index d2a7cb0e2..a467ce00e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -63,6 +63,7 @@ iree_cc_library( "AMDAIECreateAIEWorkgroup.cpp" "AMDAIECreateReferenceToAllocation.cpp" "AMDAIEDistributeCoresAndObjectFifos.cpp" + "AMDAIEDistributeL1Allocations.cpp" "AMDAIEDmaCSE.cpp" "AMDAIEDmaComposition.cpp" "AMDAIEDmaLoopSubsumption.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 7c7105b0b..4cd5586f0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -41,6 +41,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIECREATEREFERENCETOALLOCATION #define GEN_PASS_DEF_AMDAIEDECOMPOSELINALGEXTPACKUNPACKTOAIR #define GEN_PASS_DEF_AMDAIEDISTRIBUTECORESANDOBJECTFIFOS +#define GEN_PASS_DEF_AMDAIEDISTRIBUTEL1ALLOCATIONS #define GEN_PASS_DEF_AMDAIEDMACOMPOSITION #define GEN_PASS_DEF_AMDAIEDMACSE #define GEN_PASS_DEF_AMDAIEDMALOOPSUBSUMPTION diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 454d54ae9..4bc7c8bc4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -545,6 +545,11 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, TilePassPipeline useTilePipeline) { passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass()); passManager.addPass(memref::createFoldMemRefAliasOpsPass()); + + passManager.addPass(createAMDAIEDistributeL1AllocationsPass()); 
+  passManager.addPass(createCanonicalizerPass()); +  passManager.addPass(createCSEPass()); +  passManager.addPass(createCanonicalizerPass());    // For matmul pipelines, we do transpose on target side for pack ops to get // better performance. While for convolution pipelines, the same settings diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 00aa88694..df670e19f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -138,6 +138,9 @@ std::unique_ptr createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass(); /// operations and distribute the logical objectFifos. std::unique_ptr createAMDAIEDistributeCoresAndObjectFifosPass(); +/// Create pass to distribute/privatize/localize memory allocations in L1 memory. +std::unique_ptr createAMDAIEDistributeL1AllocationsPass(); + /// Create a pass to compose more complex DMA operations, e.g. by combining DMA /// operations and/or subsuming loop iterations into the strided access /// patterns. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index e6910bbde..7c8364fed 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -182,6 +182,27 @@ def AMDAIEDistributeCoresAndObjectFifos : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeCoresAndObjectFifosPass()"; } +def AMDAIEDistributeL1Allocations : + Pass<"iree-amdaie-distribute-l1-allocations", "ModuleOp"> { + let summary = "Replace distributed L1 allocations with private allocations."; + let description = [{ + Each AIE core/tile is uniquely identified by gpu thread ids, usually + 'y' (for AIE row) and 'x' (for AIE column). 
+ + Some of the compilation pipelines in iree-amd-aie generate a single L1 + memory allocation describing the concatenation of all memory for all + cores/tiles. Each thread then slices into a mutually exclusive rectangle + of the allocation, along its thread dimensions, so 'privatizing' its + memory. + + This pass rewrites these allocations to be private to each core/tile. So + it replaces a large allocation in L1 with a smaller allocation, smaller by + a factor of the number of cores/threads. + }]; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeL1AllocationsPass()"; +} + + def AMDAIEDmaComposition : Pass<"iree-amdaie-dma-composition"> { let summary = "Compose DMA operations by DMA combination and loop subsumption."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index c083125bd..570e66b83 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -28,6 +28,7 @@ iree_lit_test_suite( "create_reference_to_allocation.mlir" "disable_vectorization.mlir" "distribute_cores_and_objectfifos.mlir" + "distribute_l1_allocations.mlir" "dma_composition.mlir" "dma_cse.mlir" "dma_loop_subsumption_circular.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index 6daf18fc9..e36dcf26c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -1,3 +1,4 @@ +// R UN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-l1-allocations,iree-amdaie-distribute-cores-and-objectfifos,cse)" 
--split-input-file --verify-diagnostics %s | FileCheck %s // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s // Check for unrolling an amdaie.core within a parallel loop with a single @@ -571,76 +572,6 @@ module { } } -// ----- - -// Ensure subviews on local memrefs inside cores are handled correctly by discarding the consuming DMAs' non-zero offsets. -// CHECK-LABEL: @local_subview_output -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[ALLOC_0:.+]] = memref.alloc() : memref<1x1x32x32xi32, 2> -// CHECK-DAG: %[[ALLOC_1:.+]] = memref.alloc() : memref<2x2x32x32xi32, 1> -// CHECK-DAG: %[[ALLOC_2:.+]] = memref.alloc() : memref<64x64xi32> -// CHECK: scf.forall (%{{.+}}, %[[ARG1:.+]]) in (2, 2) -// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.+]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_1_2:.+]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_2]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_2]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_3]]} -// CHECK-DAG: %[[FROM_MEMREF_3:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_3]]} -// CHECK-DAG: %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], 
{%[[TILE_0_0]]} -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_0]][0, 0] [32, 32] [32, 1] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [], out : [%[[DMA_0]]]) -// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_0]], Write) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_1]][0, 0] [32, 32] [32, 1] -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [], out : [%[[DMA_1]]]) -// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Write) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c1, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_2]][0, 0] [32, 32] [32, 1] -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [], out : [%[[DMA_2]]]) -// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Write) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_3]][0, 0] [32, 32] [32, 1] -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [], out : [%[[DMA_3]]]) -// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Write) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -module { - func.func @local_subview_output() { - %c0_i32 = arith.constant 0 : i32 - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 2> - %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<64x64xi32> - scf.forall (%arg0, 
%arg1) in (2, 2) { - %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x2x32x32xi32, 2> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x2x32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<64x64xi32> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (2, 2) { - %subview = memref.subview %alloc_0[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 2> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2> - %8 = amdaie.dma_cpy_nd(%1[%arg2, %arg3] [%c1, %c1] [%c1, %c1], %0[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %core = amdaie.core(%tile, in : [], out : [%8]) { - linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2>) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - %9 = amdaie.dma_cpy_nd(%2[%arg1] [%c1] [%c1], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - } {mapping = [#gpu.block, #gpu.block]} - memref.dealloc %alloc_2 : memref<64x64xi32> - memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1> - memref.dealloc %alloc_0 : memref<2x2x32x32xi32, 2> - return - } -} // ----- @@ -662,19 +593,19 @@ module { func.func @l1_temporary_buffer_for_matmul_elem() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index - %alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32> + %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> scf.forall (%arg0, %arg1) in (1, 1) { scf.forall (%arg2, %arg3) in (1, 1) { - %subview = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> + %subview = memref.subview %alloc_6[0, 
0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> %26 = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %26) %27 = amdaie.core(%tile, in : [], out : []) { - linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) amdaie.end } } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} - memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32> return } @@ -1013,27 +944,27 @@ module { %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> scf.forall (%arg0, %arg1) in (1, 1) { %13 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> %14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> scf.forall (%arg2, %arg3) in (1, 1) { %19 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %1[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %20 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %0[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : 
(!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> + %subview = memref.subview %alloc_3[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> %21 = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %21) %22 = amdaie.core(%tile, in : [%19, %20], out : []) { - linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) %base_buffer, %offset, %sizes:6, %strides:6 = memref.extract_strided_metadata %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index %base_buffer_5, %offset_6, %sizes_7:6, %strides_8:6 = memref.extract_strided_metadata %alloc : memref<1x1x8x4x8x4xi32, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index - %base_buffer_9, %offset_10, %sizes_11:6, %strides_12:6 = memref.extract_strided_metadata %subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index + %base_buffer_9, %offset_10, %sizes_11:6, %strides_12:6 = memref.extract_strided_metadata %subview : memref<1x1x8x8x4x4xi32, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index func.call @matmul_i32_i32(%base_buffer, %c0, %base_buffer_5, %c0, %base_buffer_9, %offset_10) : (memref, index, memref, index, memref, index) -> () amdaie.end } {elf_file = "/path/to/ukernel.o"} } {mapping = [#gpu.thread, #gpu.thread]} } {mapping 
= [#gpu.block, #gpu.block]} - memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32> memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32> memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32> diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir new file mode 100644 index 000000000..1d3c38d7a --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir @@ -0,0 +1,93 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-l1-allocations)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// ----- + +// CHECK-LABEL: @distribute_l1_memory_test_0 + +// The L1 allocation (memory space 2) becomes private to each thread: +// CHECK: %[[L1ALLOC:.+]] = memref.alloc() : memref<1x1x32x32xi32, 2> + +// The linalg.fill acts directly on the private allocation, not a view of the +// shared allocation: +// CHECK: linalg.fill +// CHECK-SAME: outs(%[[L1ALLOC]] : memref<1x1x32x32xi32, 2>) +// CHECK: linalg.fill +// CHECK-SAME: outs(%[[L1ALLOC]] : memref<1x1x32x32xi32, 2>) +// CHECK: memref.dealloc %[[L1ALLOC]] : memref<1x1x32x32xi32, 2> + +func.func @distribute_l1_memory_test_0() { + %c0_i32 = arith.constant 0 : i32 + %alloc = memref.alloc() : memref<2x2x32x32xi32, 2> + scf.forall (%arg2, %arg3) in (2, 2) { + %subview = memref.subview %alloc[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 2> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2> + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2>) + } {mapping = [#gpu.thread, #gpu.thread]} + scf.forall (%arg2, %arg3) in (2, 2) { + %subview = memref.subview %alloc[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1,
1, 1, 1] : memref<2x2x32x32xi32, 2> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2> + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2>) + } {mapping = [#gpu.thread, #gpu.thread]} + memref.dealloc %alloc : memref<2x2x32x32xi32, 2> + return +} + +// ----- + +// CHECK-LABEL: @transfer_read_test() +// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x8xbf16, 2> +// CHECK: vector.transfer_read %[[ALLOC]] +// CHECK-SAME: memref<1x8xbf16, 2>, vector<1x8xbf16> + +func.func @transfer_read_test(){ + %alloc = memref.alloc() : memref<4x8xbf16, 2> + scf.forall (%arg0) in (4) { + %c0 = arith.constant 0 : index + %c0_bf16 = arith.constant 0.000000e+00 : bf16 + %subview = memref.subview %alloc[%arg0, 0] [1, 8] [1, 1] : + memref<4x8xbf16, 2> to memref<1x8xbf16, strided<[8, 1], offset: ?>, 2> + %vector = vector.transfer_read %subview[%c0, %c0], %c0_bf16 {in_bounds = [true, true]} : + memref<1x8xbf16, strided<[8, 1], offset: ?>, 2>, vector<1x8xbf16> + } {mapping = [#gpu.thread]} + return +} + +// ----- + +// CHECK: @transfer_write_test(%[[VECTOR:.+]]: vector<1x8xbf16>) +// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x8xbf16, 2> +// CHECK: scf.forall +// CHECK: vector.transfer_write %[[VECTOR]], %[[ALLOC]] +// CHECK-SAME: vector<1x8xbf16>, memref<1x8xbf16, 2> + +func.func @transfer_write_test(%vector : vector<1x8xbf16>){ + %alloc = memref.alloc() : memref<4x8xbf16, 2> + scf.forall (%arg0) in (4) { + %c0 = arith.constant 0 : index + %c0_bf16 = arith.constant 0.000000e+00 : bf16 + %subview = memref.subview %alloc[%arg0, 0] [1, 8] [1, 1] : + memref<4x8xbf16, 2> to memref<1x8xbf16, strided<[8, 1], offset: ?>, 2> + vector.transfer_write %vector, %subview[%c0, %c0] {} : vector<1x8xbf16>, memref<1x8xbf16, strided<[8, 1], offset: ?>, 2> + } {mapping = [#gpu.thread]} + return +} + +// ----- + +// Example where the subview cannot be determined to be distributing: + +// CHECK-LABEL: @non_distributing_subview 
+// CHECK-NOT: memref.alloc() : memref<1x4xbf16, 2> +// CHECK: return + +func.func @non_distributing_subview(%index : index) { + %alloc = memref.alloc() : memref<4x8xbf16, 2> + scf.forall (%arg0) in (4) { + %c0 = arith.constant 0 : index + %c0_bf16 = arith.constant 0.000000e+00 : bf16 + %subview = memref.subview %alloc[%arg0, %index] [1, 4] [1, 1] : + memref<4x8xbf16, 2> to memref<1x4xbf16, strided<[8, 1], offset: ?>, 2> + } {mapping = [#gpu.thread]} + return +} + + + From 725bdc91273967ddb44178b7f02ede593a3405bb Mon Sep 17 00:00:00 2001 From: James Newling Date: Fri, 11 Oct 2024 09:18:55 -0700 Subject: [PATCH 2/3] for loop {} --- .../iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp index 63e797356..710d0ddfb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp @@ -36,9 +36,8 @@ FailureOr> getThreadIndVars(ModuleOp moduleOp) { if (mapping.empty()) return WalkResult::advance(); if (!isa(*mapping.begin())) return WalkResult::advance(); - for (Value indVar : forallOp.getInductionVars()) { + for (Value indVar : forallOp.getInductionVars()) threadIndVars.insert(indVar); - } return WalkResult::advance(); }); return threadIndVars; From ff1b9c107db5fc7eb5366b1fe5c6a4bfc9f74c82 Mon Sep 17 00:00:00 2001 From: James Newling Date: Fri, 11 Oct 2024 11:18:58 -0700 Subject: [PATCH 3/3] remove old line --- .../Transforms/test/distribute_cores_and_objectfifos.mlir | 1 - 1 file changed, 1 deletion(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index e36dcf26c..b00ccb6b0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -1,4 +1,3 @@ -// R UN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-l1-allocations,iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s // Check for unrolling an amdaie.core within a parallel loop with a single