diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIECoreToStandard.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIECoreToStandard.cpp index 2fa1d3f90..6696fe4c5 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIECoreToStandard.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIECoreToStandard.cpp @@ -6,15 +6,10 @@ #include "AIEDialect.h" #include "Passes.h" -#include "aievec/AIEVecDialect.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/Math/IR/Math.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/PatternMatch.h" @@ -24,148 +19,109 @@ #define DEBUG_TYPE "amdaie-standard-lowering" using namespace mlir; -using namespace mlir::vector; -using namespace xilinx; using namespace xilinx::AIE; -struct AMDAIEUseLockToStdLowering : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite( - UseLockOp useLock, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!isa(useLock->getParentOp())) { - // Generate the intrinsic name - std::string funcName = "llvm.aie2."; - if (useLock.getAction() == LockAction::Acquire || - useLock.getAction() == LockAction::AcquireGreaterEqual) - funcName += "acquire"; - else if (useLock.getAction() == LockAction::Release) - funcName += "release"; - // TODO(max): this can be simplified with - // SymbolTable::lookupNearestSymbolFrom if DeviceOp ceases to be a - // SymbolTable - func::FuncOp useLockFunc = - useLock->getParentOfType().lookupSymbol( - funcName); - - SmallVector args; - int lockValue = useLock.getValue().value_or(1); - - // AIE2 acquire greater equal is encoded as a negative value. - if (useLock.getAction() == LockAction::AcquireGreaterEqual) - lockValue = -lockValue; - args.push_back(rewriter.create( - useLock.getLoc(), IntegerType::get(rewriter.getContext(), 32), - useLock.getLock())); - args.push_back(rewriter.create( - useLock.getLoc(), IntegerType::get(rewriter.getContext(), 32), - rewriter.getI32IntegerAttr(lockValue))); - rewriter.create(rewriter.getUnknownLoc(), useLockFunc, - args); - } +static void lockToStd(UseLockOp useLock, IRRewriter &rewriter) { + if (!isa(useLock->getParentOp())) { + std::string funcName = [&]() { + switch (useLock.getAction()) { + case LockAction::Acquire: + case LockAction::AcquireGreaterEqual: + return "llvm.aie2.acquire"; + case LockAction::Release: + return "llvm.aie2.release"; + default: + assert(false && "Unknown lock action"); + } + }(); + + // TODO(max): this can be simplified with + // SymbolTable::lookupNearestSymbolFrom if DeviceOp ceases to be a + // SymbolTable + ModuleOp modOp = useLock->getParentOfType(); + func::FuncOp func = modOp.lookupSymbol(funcName); + + int lockValue = useLock.getValue().value_or(1); + + // AIE2 acquire greater equal is encoded as a negative value. 
+ if (useLock.getAction() == LockAction::AcquireGreaterEqual) + lockValue = -lockValue; - rewriter.eraseOp(useLock); - return success(); + rewriter.setInsertionPoint(useLock); + IntegerAttr lockAttr = rewriter.getI32IntegerAttr(lockValue); + IntegerType type = IntegerType::get(rewriter.getContext(), 32); + Location loc = useLock.getLoc(); + + SmallVector<Value> args{ + rewriter.create<arith::IndexCastOp>(loc, type, useLock.getLock()), + rewriter.create<arith::ConstantOp>(loc, type, lockAttr)}; + + rewriter.create<func::CallOp>(loc, func, args); } -}; -struct AMDAIEBufferToStandard : OpConversionPattern<BufferOp> { - using OpConversionPattern<BufferOp>::OpConversionPattern; - ModuleOp &module; - // TODO(max): these should be optionals instead of checking against -1 - // but the pass itself needs to be updated. - int tileCol = 0; - int tileRow = 0; - AMDAIEBufferToStandard(MLIRContext *context, ModuleOp &m, int tileCol = -1, - int tileRow = -1) - : OpConversionPattern<BufferOp>(context), - module(m), - tileCol(tileCol), - tileRow(tileRow) {} - LogicalResult matchAndRewrite( - BufferOp buffer, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - rewriter.setInsertionPointToStart(module.getBody()); - auto t = llvm::cast<MemRefType>(buffer.getType()); - StringRef symName = name(buffer).getValue(); - // Don't emit initialization for cores that don't "own" the buffer (to - // prevent duplication in the data section of the elf/object file) - rewriter.create<memref::GlobalOp>( - rewriter.getUnknownLoc(), symName, rewriter.getStringAttr("public"), - buffer.getType(), nullptr, /*constant*/ false, - /*alignment*/ nullptr); - - for (OpOperand &use : make_early_inc_range(buffer.getResult().getUses())) { - Operation *user = use.getOwner(); - rewriter.setInsertionPoint(user); - auto allocated = rewriter.create<memref::GetGlobalOp>( - rewriter.getUnknownLoc(), t, symName); - // Assume that buffers are aligned so they can be vectorized. - rewriter.create<memref::AssumeAlignmentOp>(rewriter.getUnknownLoc(), - allocated, 32); - use.set(allocated.getResult()); - } + rewriter.eraseOp(useLock); +} - rewriter.eraseOp(buffer); - return success(); +static void bufferToStd(ModuleOp module, BufferOp buffer, + IRRewriter &rewriter) { + Location loc = buffer.getLoc(); + rewriter.setInsertionPointToStart(module.getBody()); + StringRef symName = name(buffer).getValue(); + MemRefType type = llvm::cast<MemRefType>(buffer.getType()); + // Don't emit initialization for cores that don't "own" the buffer (to + // prevent duplication in the data section of the elf/object file) + rewriter.create<memref::GlobalOp>( + loc, symName, rewriter.getStringAttr("public"), type, nullptr, + /*constant*/ false, + /*alignment*/ nullptr); + + for (OpOperand &use : make_early_inc_range(buffer.getResult().getUses())) { + Operation *user = use.getOwner(); + rewriter.setInsertionPoint(user); + + auto allocated = rewriter.create<memref::GetGlobalOp>(loc, type, symName); + // Assume that buffers are aligned so they can be vectorized. + rewriter.create<memref::AssumeAlignmentOp>(loc, allocated, 32); + use.set(allocated.getResult()); } -}; -struct AMDAIECoreToStandardFunc : OpConversionPattern<CoreOp> { - using OpConversionPattern<CoreOp>::OpConversionPattern; - IRMapping &mapper; - // TODO(max): these should be optionals instead of checking against -1 - // but the pass itself needs to be updated.
- int tileCol = 0; - int tileRow = 0; - - AMDAIECoreToStandardFunc(MLIRContext *context, IRMapping &mapper, - int tileCol = 1, int tileRow = 1) - : OpConversionPattern(context), - mapper(mapper), - tileCol(tileCol), - tileRow(tileRow) {} - - LogicalResult matchAndRewrite( - CoreOp coreOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - TileOp t = getTileOp(*coreOp); - int col = t.getCol(); - int row = t.getRow(); - - // Only pull code for the indicated function - if ((tileRow != row && tileRow != -1) || - (tileCol != col && tileCol != -1)) { - rewriter.eraseOp(coreOp); - return success(); - } + rewriter.eraseOp(buffer); +} - // The parent should be an AIE.device op. - rewriter.setInsertionPointAfter(coreOp->getParentOp()); - - std::string coreName("core_" + std::to_string(col) + "_" + - std::to_string(row)); - auto coreFunc = rewriter.create( - rewriter.getUnknownLoc(), coreName, - FunctionType::get(rewriter.getContext(), {}, {})); - - rewriter.cloneRegionBefore(coreOp.getBody(), coreFunc.getBody(), - coreFunc.getBody().begin(), mapper); - - // Rewrite the AIE.end() op - coreFunc.getBody().walk([&](Operation *childOp) { - rewriter.setInsertionPointAfter(childOp); - if (isa(childOp)) { - rewriter.create(rewriter.getUnknownLoc(), - ValueRange({})); - rewriter.eraseOp(childOp); - } - }); +static void coreToStd(CoreOp coreOp, IRRewriter &rewriter, int tileCol, + int tileRow) { + TileOp t = getTileOp(*coreOp); + int col = t.getCol(); + int row = t.getRow(); + // Only pull code for the indicated function + if ((tileRow != row && tileRow != -1) || (tileCol != col && tileCol != -1)) { rewriter.eraseOp(coreOp); - return success(); + return; } -}; + + // The parent should be an AIE.device op. + rewriter.setInsertionPointAfter(coreOp->getParentOp()); + + // LLVM-style of the above (creating a string attribute): + std::string fName = "core_" + std::to_string(col) + "_" + std::to_string(row); + auto coreFunc = rewriter.create( + rewriter.getUnknownLoc(), fName, + FunctionType::get(rewriter.getContext(), {}, {})); + + IRMapping mapper; + rewriter.cloneRegionBefore(coreOp.getBody(), coreFunc.getBody(), + coreFunc.getBody().begin(), mapper); + + // Rewrite the AIE.end op + coreFunc.getBody().walk([&](EndOp endOp) { + rewriter.setInsertionPointAfter(endOp); + rewriter.create(endOp->getLoc(), ValueRange({})); + rewriter.eraseOp(endOp); + }); + + rewriter.eraseOp(coreOp); +} // Move all the ops with OpTy inside device, to just before the device. template @@ -211,6 +167,30 @@ struct AMDAIECoreToStandardPass : mlir::OperationPass { llvm::cl::desc("Y coordinate of tile to generate code for"), llvm::cl::init(-1)}; + // Assert that cores are isolated + static bool coresAreIsolated(ModuleOp m) { + SmallVector coreOps; + m->walk([&](CoreOp coreOp) { coreOps.push_back(coreOp); }); + for (CoreOp coreOp : coreOps) { + auto walkResult = coreOp->walk([&](Operation *childOp) { + if (childOp == coreOp) return WalkResult::advance(); + for (Value operand : childOp->getOperands()) { + if (Operation *operandOp = operand.getDefiningOp()) { + if (!coreOp->isAncestor(operandOp)) { + operandOp->emitOpError( + "is not in the core in which it is used. 
Cores must be " + "`isolated` before this point."); + return WalkResult::interrupt(); + } + } + } + return WalkResult::advance(); + }); + if (walkResult.wasInterrupted()) return false; + } + return true; + } + void runOnOperation() override { ModuleOp m = getOperation(); @@ -219,79 +199,39 @@ struct AMDAIECoreToStandardPass : mlir::OperationPass { return signalPassFailure(); } - OpBuilder builder = OpBuilder::atBlockEnd(m.getBody()); + MLIRContext *ctx = &getContext(); + IRRewriter rewriter(ctx); + rewriter.setInsertionPointToEnd(m.getBody()); - // Ensure that we don't have an incorrect target triple. This may override + // Ensure that we don't have an incorrect target triple. This may override // some bogus target triple in the original mlir. m->setAttr(LLVM::LLVMDialect::getTargetTripleAttrName(), - builder.getStringAttr("aie2")); - - IRMapping mapper; - ConversionTarget target(getContext()); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalOp(); - - RewritePatternSet patterns(&getContext()); - - StringAttr privateSym = StringAttr::get(&getContext(), "private"); + rewriter.getStringAttr("aie2")); + + StringAttr privateSym = StringAttr::get(ctx, "private"); auto buildDecl = [&](const std::string &funcName) { - builder.create( - builder.getUnknownLoc(), funcName, - FunctionType::get(builder.getContext(), - {builder.getI32Type(), builder.getI32Type()}, {}), + rewriter.create( + rewriter.getUnknownLoc(), funcName, + FunctionType::get(ctx, {rewriter.getI32Type(), rewriter.getI32Type()}, + {}), privateSym, ArrayAttr{}, ArrayAttr{}); }; buildDecl("llvm.aie2.acquire"); buildDecl("llvm.aie2.release"); - patterns.add(m.getContext()); - patterns.add(m.getContext(), m, tileCol, tileRow); - if (failed(applyPartialConversion(m, target, std::move(patterns)))) - return signalPassFailure(); + m.walk([&](UseLockOp useLock) { lockToStd(useLock, rewriter); }); - // Assert that cores are isolated - { - SmallVector coreOps; - m->walk([&](CoreOp coreOp) { coreOps.push_back(coreOp); }); - for (CoreOp coreOp : coreOps) { - auto walkResult = coreOp->walk([&](Operation *childOp) { - if (childOp == coreOp) return WalkResult::advance(); - for (Value operand : childOp->getOperands()) { - if (Operation *operandOp = operand.getDefiningOp()) { - if (!coreOp->isAncestor(operandOp)) { - operandOp->emitOpError( - "is not in the core in which it is used. Cores must be " - "`isolated` before this point."); - return WalkResult::interrupt(); - } - } - } - return WalkResult::advance(); - }); - if (walkResult.wasInterrupted()) return signalPassFailure(); - } - } - RewritePatternSet outlinePatterns(&getContext()); - outlinePatterns.add(m.getContext(), mapper, - tileCol, tileRow); - if (failed(applyPartialConversion(m, target, std::move(outlinePatterns)))) - return signalPassFailure(); + m.walk([&](BufferOp buffer) { bufferToStd(m, buffer, rewriter); }); + + if (!coresAreIsolated(m)) return signalPassFailure(); - // Move all the func.func ops and memref.globals from the device to the - // module + m.walk( + [&](CoreOp coreOp) { coreToStd(coreOp, rewriter, tileCol, tileRow); }); + + // Move all the func.func ops and memref.globals from device to module. 
DeviceOp device = *m.getOps().begin(); outlineOps(device); outlineOps(device); - - MLIRContext &context = getContext(); - IRRewriter rewriter(&context); rewriter.eraseOp(device); } }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp index 229e86e53..e1fdc46f8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp @@ -60,8 +60,14 @@ bool areAccessPatternsCombinable(const SmallVector &offsetsA, } if (strideA != strideB) return false; } + + // Don't check the outermost dimension of size at this point. + SmallVector innerSizesA; + SmallVector innerSizesB; + std::copy(sizesA.begin() + 1, sizesA.end(), std::back_inserter(innerSizesA)); + std::copy(sizesB.begin() + 1, sizesB.end(), std::back_inserter(innerSizesB)); for (auto &&[sizeA, sizeB] : - llvm::zip(llvm::reverse(sizesA), llvm::reverse(sizesB))) { + llvm::zip(llvm::reverse(innerSizesA), llvm::reverse(innerSizesB))) { std::optional maybeSizeA = getConstantIntValue(sizeA); std::optional maybeSizeB = getConstantIntValue(sizeB); // Handle static and constant value with same int value. @@ -71,6 +77,20 @@ bool areAccessPatternsCombinable(const SmallVector &offsetsA, if (sizeA != sizeB) return false; } + // Edge case for sizesA[0] != sizesB[0]. + if (offsetsB.size() == offsetsA.size() && sizesA[0] != sizesB[0]) { + std::optional constOffsetA = getConstantIntValue(offsetsA[0]); + std::optional constSizeA = getConstantIntValue(sizesA[0]); + std::optional constOffsetB = getConstantIntValue(offsetsB[0]); + std::optional constSizeB = getConstantIntValue(sizesB[0]); + if (constOffsetA && constOffsetB && constSizeA && constSizeB) { + int64_t offsetDiff = constOffsetB.value() - constOffsetA.value(); + if (constSizeA.value() != offsetDiff) return false; + } else { + return false; + } + } + bool foundDiff{false}; for (auto iter : llvm::enumerate( llvm::zip(llvm::reverse(offsetsA), llvm::reverse(offsetsB)))) { @@ -169,40 +189,50 @@ LogicalResult combineAccessPatterns(RewriterBase &rewriter, if (!size) return failure(); newSizes[0] = rewriter.getI64IntegerAttr(size.value() + 1); } else { - // Sizes are the same, so add a new dimension with 'offset == 0', 'size == - // 2' and 'stride == offsetDiff'. - newOffsets.push_back(rewriter.getI64IntegerAttr(0)); - int64_t offsetDiff; - int64_t strideMultiplier; - for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) { - const OpFoldResult &offsetA = std::get<0>(iter.value()); - const OpFoldResult &offsetB = std::get<1>(iter.value()); - newOffsets.push_back(offsetA); - if (offsetA != offsetB) { - std::optional constOffsetA = getConstantIntValue(offsetA); - std::optional constOffsetB = getConstantIntValue(offsetB); - if (!constOffsetA || !constOffsetB) { - return emitError(rewriter.getUnknownLoc()) - << "differing offsets should be constants"; - } - offsetDiff = constOffsetB.value() - constOffsetA.value(); - std::optional maybeStride = - getConstantIntValue(stridesA[iter.index()]); - if (!maybeStride) { - return emitError(rewriter.getUnknownLoc()) - << "no constant stride found at the same index where the " - "offset " - "difference occurs"; + // Edge case for sizesA[0] != sizesB[0]. 
+ if (sizesA[0] != sizesB[0]) { + newOffsets = offsetsA; + newSizes = sizesA; + newStrides = stridesA; + std::optional sizeA = getConstantIntValue(sizesA[0]); + std::optional sizeB = getConstantIntValue(sizesB[0]); + if (!sizeA || !sizeB) return failure(); + newSizes[0] = rewriter.getI64IntegerAttr(sizeA.value() + sizeB.value()); + } else { + // All dims of sizes are the same, so add a new dimension with + // 'offset == 0', 'size == 2' and 'stride == offsetDiff'. + newOffsets.push_back(rewriter.getI64IntegerAttr(0)); + int64_t offsetDiff; + int64_t strideMultiplier; + for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) { + const OpFoldResult &offsetA = std::get<0>(iter.value()); + const OpFoldResult &offsetB = std::get<1>(iter.value()); + newOffsets.push_back(offsetA); + if (offsetA != offsetB) { + std::optional constOffsetA = getConstantIntValue(offsetA); + std::optional constOffsetB = getConstantIntValue(offsetB); + if (!constOffsetA || !constOffsetB) { + return emitError(rewriter.getUnknownLoc()) + << "differing offsets should be constants"; + } + offsetDiff = constOffsetB.value() - constOffsetA.value(); + std::optional maybeStride = + getConstantIntValue(stridesA[iter.index()]); + if (!maybeStride) { + return emitError(rewriter.getUnknownLoc()) + << "no constant stride found at the same index where the " + "offset " + "difference occurs"; + } + strideMultiplier = maybeStride.value(); } - strideMultiplier = maybeStride.value(); } + newSizes.push_back(rewriter.getI64IntegerAttr(2)); + newSizes.append(sizesA.begin(), sizesA.end()); + newStrides.push_back( + rewriter.getI64IntegerAttr(offsetDiff * strideMultiplier)); + newStrides.append(stridesA.begin(), stridesA.end()); } - newSizes.push_back(rewriter.getI64IntegerAttr(2)); - newSizes.append(sizesA.begin(), sizesA.end()); - newStrides.push_back( - rewriter.getI64IntegerAttr(offsetDiff * strideMultiplier)); - newStrides.append(stridesA.begin(), stridesA.end()); - ; } assert(newOffsets.size() == newSizes.size() && "expected same number of new offsets and sizes"); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETemporaryAllocBufferization.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETemporaryAllocBufferization.cpp index 480dfda6d..8b56096c3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETemporaryAllocBufferization.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETemporaryAllocBufferization.cpp @@ -48,7 +48,6 @@ LogicalResult bufferizeTemporaryMemrefs(Operation *parentOp) { }); } - // Note: we don't erase allocs/deallocs, we leave this for canonicalization. 
return success(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index b5044f592..5d10b4d15 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -682,12 +682,11 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device, passManager.addPass(createCSEPass()); { xilinx::air::AIRFuseChannelsOptions options; - std::vector mode; if (useTilePipeline == TilePassPipeline::PackPeelPipeline && matmulElementwiseFusion) { - mode.push_back("L1"); + const static llvm::SmallVector mode = {"L1"}; + options.clAggressiveMode = mode; } - options.clAggressiveMode = ArrayRef(mode); passManager.addPass(xilinx::air::createAIRFuseChannels(options)); } passManager.addPass(createCanonicalizerPass()); @@ -753,14 +752,13 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device, // with given factors, and subsequently unrolled in // AIRUnrollOuterPerfectlyNestedLoopsPass, to enforce SHIM DMA BD count // within the hardware limit. - std::vector tile_sizes; if (useTilePipeline == TilePassPipeline::PackPeelPipeline) { - tile_sizes = {2, 2}; + const static llvm::SmallVector tile_sizes = {2, 2}; + options.clTileSizes = tile_sizes; } else if (useTilePipeline == TilePassPipeline::PadPackPipeline) { - tile_sizes = {4, 4}; - } else - tile_sizes = {}; - options.clTileSizes = ArrayRef(tile_sizes); + const static llvm::SmallVector tile_sizes = {4, 4}; + options.clTileSizes = tile_sizes; + } passManager.addNestedPass( xilinx::air::createAffineLoopOptPass(options)); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp index ab15b1fe4..0a015a08a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp @@ -111,6 +111,8 @@ TEST_F(AccessPatternCombinationTest, CombinableAccessPatterns) { EXPECT_TRUE(checkAreAccessPatternsCombinable({0, 2, 0}, {16, 16, 32}, {32, 64, 1}, {0, 2, 32}, {16, 16, 32}, {32, 64, 1}, 4)); + EXPECT_TRUE(checkAreAccessPatternsCombinable({32, 0}, {64, 64}, {128, 1}, + {96, 0}, {32, 64}, {128, 1}, 4)); // size(A) > size(B) EXPECT_TRUE(checkAreAccessPatternsCombinable( {0, 0, 0}, {2, 16, 32}, {32, 64, 1}, {0, 64}, {16, 32}, {64, 1}, 4)); @@ -168,6 +170,12 @@ TEST_F(AccessPatternCombinationTest, NonCombinableAccessPatterns) { {0, 0}, {16, 32}, {64, 1}, {0, 0, 96}, {2, 16, 32}, {32, 64, 1}, 4)); EXPECT_FALSE(checkAreAccessPatternsCombinable( {0, 0}, {16, 32}, {64, 1}, {0, 1, 0}, {2, 16, 32}, {32, 64, 1}, 4)); + + // size(A) == size(B) Incompatible offset + EXPECT_FALSE(checkAreAccessPatternsCombinable( + {32, 0}, {64, 64}, {128, 1}, {32, 0}, {32, 64}, {128, 1}, 4)); + EXPECT_FALSE(checkAreAccessPatternsCombinable( + {32, 0}, {32, 64}, {128, 1}, {96, 0}, {64, 64}, {128, 1}, 4)); } TEST_F(AccessPatternCombinationTest, CombineAccessPatterns) { @@ -197,6 +205,8 @@ TEST_F(AccessPatternCombinationTest, CombineAccessPatterns) { checkCombineAccessPatterns({8, 0, 0}, {16, 8, 16}, {16, 8, 1}, {40, 0, 0}, {16, 8, 16}, {16, 8, 1}, {0, 8, 0, 0}, {2, 16, 8, 16}, {512, 16, 8, 1}, 4); + checkCombineAccessPatterns({32, 0}, {64, 64}, {128, 1}, {96, 0}, {32, 64}, + {128, 1}, {32, 0}, {96, 64}, {128, 1}, 4); // size(A) > size(B) 
checkCombineAccessPatterns({0, 0}, {2, 32}, {64, 1}, {128}, {32}, {1}, {0, 0}, {3, 32}, {64, 1}, 3); @@ -255,6 +265,10 @@ TEST_F(AccessPatternCombinationTest, FailCombineAccessPatterns) { {3, 32}, {64, 1}, 3, false); checkCombineAccessPatterns({0}, {32}, {1}, {0, 96}, {2, 32}, {64, 1}, {0, 0}, {3, 32}, {64, 1}, 3, false); + + // size(A) == size(B) Incompatible offset + checkCombineAccessPatterns({32, 0}, {32, 64}, {128, 1}, {96, 0}, {64, 64}, + {128, 1}, {32, 0}, {96, 64}, {128, 1}, 4, false); } } // namespace diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir index fd0a49bc9..25dd958c8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir @@ -230,6 +230,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// CHECK-LABEL: @combine_source_same_dims_diff_sizes +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([] [] [], [0, 0] [128, 64] [128, 1]) +// CHECK-NOT: amdaie.npu.dma_cpy_nd +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @combine_source_same_dims_diff_sizes(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0] [32, 64] [128, 1]) + amdaie.npu.dma_cpy_nd %0([] [] [], [32, 0] [64, 64] [128, 1]) + amdaie.npu.dma_cpy_nd %0([] [] [], [96, 0] [32, 64] [128, 1]) + amdaie.end + } + } + return + } +} + +// ----- + // CHECK-LABEL: @combine_source_values // CHECK: %[[CONNECTION:.+]] = amdaie.connection // CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([] [] [], [0, 0, 0, 0] [2, 16, 8, 16] [32, 32, 8, 1]) @@ -332,6 +354,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// CHECK-LABEL: @combine_target_same_dims_diff_sizes +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([0, 0] [128, 64] [128, 1], [] [] []) +// CHECK-NOT: amdaie.npu.dma_cpy_nd +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @combine_target_same_dims_diff_sizes(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + amdaie.npu.dma_cpy_nd %0([0, 0] [32, 64] [128, 1], [] [] []) + amdaie.npu.dma_cpy_nd %0([32, 0] [64, 64] [128, 1], [] [] []) + amdaie.npu.dma_cpy_nd %0([96, 0] [32, 64] [128, 1], [] [] []) + amdaie.end + } + } + return + } +} + +// ----- + // CHECK-LABEL: @combine_target_diff_dims // CHECK: %[[CONNECTION:.+]] = amdaie.connection // CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([0, 0, 0, 32] [3, 16, 8, 16] [64, 32, 8, 1], [] [] []) diff --git a/third_party/iree b/third_party/iree index 20a7638c1..2963600a6 160000 --- a/third_party/iree +++ 
b/third_party/iree @@ -1 +1 @@ -Subproject commit 20a7638c1584c98e1b2442a011c546f5d471631d +Subproject commit 2963600a6f5a3d7a4a998ce8c7d4f9e46391c0cc
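Note on the access-pattern change in AMDAIEDmaUtils.cpp: the new edge case lets two patterns of equal rank, with identical strides and identical inner sizes, be combined when the second pattern starts exactly where the first ends along the outermost dimension (offsetB[0] - offsetA[0] == sizeA[0]); the combined pattern keeps A's offsets and strides and sums the outermost sizes. The sketch below is a simplified, self-contained illustration of that rule using plain integers rather than the actual IREE helpers (Pattern and combineOuterDim are hypothetical names, not part of the codebase); it mirrors the new unit test [32, 0][64, 64][128, 1] + [96, 0][32, 64][128, 1] -> [32, 0][96, 64][128, 1].

// Hypothetical, simplified sketch of the outer-dimension combination rule
// added to areAccessPatternsCombinable/combineAccessPatterns; not the IREE
// implementation (which works on OpFoldResult and handles more cases).
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

struct Pattern {
  std::vector<int64_t> offsets, sizes, strides;
};

// Combine two same-rank patterns whose strides and inner dims match, provided
// B starts exactly where A ends: offsetB[0] - offsetA[0] == sizeA[0].
std::optional<Pattern> combineOuterDim(const Pattern &a, const Pattern &b) {
  if (a.offsets.size() != b.offsets.size()) return std::nullopt;
  if (a.strides != b.strides) return std::nullopt;
  // Inner dimensions (everything past dim 0) must match exactly.
  for (size_t i = 1; i < a.sizes.size(); ++i) {
    if (a.sizes[i] != b.sizes[i] || a.offsets[i] != b.offsets[i])
      return std::nullopt;
  }
  if (b.offsets[0] - a.offsets[0] != a.sizes[0]) return std::nullopt;
  Pattern combined = a;  // keep A's offsets and strides
  combined.sizes[0] = a.sizes[0] + b.sizes[0];
  return combined;
}

int main() {
  // Mirrors the new combinable test:
  // [32, 0][64, 64][128, 1] + [96, 0][32, 64][128, 1] -> [32, 0][96, 64][128, 1].
  Pattern a{{32, 0}, {64, 64}, {128, 1}};
  Pattern b{{96, 0}, {32, 64}, {128, 1}};
  std::optional<Pattern> c = combineOuterDim(a, b);
  assert(c && c->offsets[0] == 32 && c->sizes[0] == 96);

  // Rejected, matching the new "Incompatible offset" test: B does not start
  // where A ends (96 - 32 != 32).
  Pattern a2{{32, 0}, {32, 64}, {128, 1}};
  Pattern b2{{96, 0}, {64, 64}, {128, 1}};
  assert(!combineOuterDim(a2, b2));
  return 0;
}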