From 57cd0bca89917e47639b9189683ac71b5d399f3c Mon Sep 17 00:00:00 2001 From: James Newling Date: Tue, 20 Aug 2024 23:14:16 -0700 Subject: [PATCH] Do not serialize-deserialize module before calling aie2xclbin (#685) This PR does a few things: 1) Before mlir-aie was pulled into iree-amd-aie, the IR had to be serialized and deserialized in order to call the aie2xclbin program. We now use aie2xclbin as a library function rather than shelling out, so no serialization-deserialization is needed. 2) This PR moves the dma-to-npu pass closer to the lower-to-aie pass. I think we'd eventually like to change the lowering of npu instructions from ``` amdaie dialect -> aie dialect -> npu ``` to ``` amdaie dialect -> npu ```, because the amdaie and aie dialects are very similar and this indirection doesn't provide us with anything, afaict. Making that change is not currently possible (dma-to-npu must currently run after the stateful transform pass), but this change is a step in that direction. Test changes: I removed some CHECKs for `aiex.runtime_sequence` in tests/samples, because that op is now folded into the LX instructions (a sequence of integers). In my mind the tests in tests/samples are only useful to check that compilation doesn't error/crash, so IMO removing CHECK lines there is fine. 3) General clean-up; for example, we don't need the aiex-to-standard pass anymore. --- .../AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp | 3 - .../target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp | 36 ++--- .../target/AMD-AIE/aie/AMDAIEXToStandard.cpp | 89 ----------- .../plugins/target/AMD-AIE/aie/CMakeLists.txt | 1 - compiler/plugins/target/AMD-AIE/aie/Passes.h | 4 +- .../target/AMD-AIE/aie/test/aiert_insts.mlir | 2 +- .../aie/test/aiex_standard_lowering.mlir | 22 --- .../target/AMD-AIE/aie/test/dma_to_npu.mlir | 23 --- .../iree-amd-aie/PluginRegistration.cpp | 1 - .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 93 +++++------ .../iree-amd-aie/Target/AMDAIETargetBCF.cpp | 7 +- .../Target/AMDAIETargetCDODirect.cpp | 12 +- .../Target/AMDAIETargetLdScript.cpp | 6 +- .../iree-amd-aie/Target/AMDAIETargets.h | 6 +- .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp | 145 +++++++++++------- .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.h | 12 +- .../Target/tests/aie_cdo_gen_test.cxx | 15 +- .../iree-amd-aie/Transforms/Passes.cpp | 32 ++-- tests/samples/conv_pipeline_e2e.mlir | 10 -- .../samples/matmul_peeled_objectfifo_e2e.mlir | 17 +- tests/samples/pack_peel_pipeline_matmul.mlir | 10 -- ...pack_peel_pipeline_matmul_elementwise.mlir | 34 +--- tests/samples/pad_pack_pipeline_e2e.mlir | 21 --- 23 files changed, 204 insertions(+), 397 deletions(-) delete mode 100644 compiler/plugins/target/AMD-AIE/aie/AMDAIEXToStandard.cpp delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/aiex_standard_lowering.mlir diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp index d531ec619..89e7bfd8b 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp @@ -6,15 +6,12 @@ #include #include -#include #include #include "AIEDialect.h" #include "Passes.h" #include "iree-amd-aie/aie_runtime/iree_aie_router.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" -#include "llvm/ADT/DenseMapInfo.h" -#include "llvm/Support/raw_os_ostream.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp
b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp index f15990681..5e73be939 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp @@ -10,7 +10,6 @@ #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/Support/Format.h" #include "mlir/IR/AsmState.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -512,6 +511,7 @@ struct AMDAIEDmaToNpuPass : mlir::OperationPass { instructions[2] = count; instructions[3] = instructions.size() * sizeof(uint32_t); + ArrayRef instsArrRef(instructions.data(), instructions.size()); device->setAttr( "npu_instructions", @@ -521,24 +521,24 @@ struct AMDAIEDmaToNpuPass : mlir::OperationPass { IntegerType::get(&getContext(), 32, IntegerType::Unsigned)), "npu_instructions", HeapAsmResourceBlob::allocateAndCopyInferAlign(instsArrRef))); - // The LX instructions for the entry point function are already generated by - // the pass hence we can safely delete the function as it is of no use to - // us. A reason to do this is that otherwise it is unceseccarily lowered to - // llvm where it can have a chance to crash in case the argument list is not - // lowerable for reasons such as memref's with dynamic offsets. - auto symName = dyn_cast_or_null(device->getAttr("sym_name")); + SmallVector seqOps; - device->walk([&](RuntimeSequenceOp seqOp) { - // if the deviceOp has a symbol name attached to it we look for the - // sequence op that partically matches that symbol, if not we collect all - // sequenceOps. - if (!symName || - symName.str().find(seqOp.getSymName()->str()) != std::string::npos) - seqOps.push_back(seqOp); - }); - // If exactly one entry point function is found we can delete it. For any - // other result we do not make any change. - if (seqOps.size() == 1) seqOps[0].erase(); + device->walk([&](RuntimeSequenceOp seqOp) { seqOps.push_back(seqOp); }); + + if (seqOps.size() > 1) { + device->emitOpError("has ") + << seqOps.size() + << " aiex.runtime_sequence ops. Expected no more than 1."; + signalPassFailure(); + } + + if (seqOps.size() == 1) { + auto seqOp = seqOps[0]; + StringRef name = seqOp.getSymName().value(); + device->setAttr("runtime_sequence_name", + StringAttr::get(&getContext(), name)); + seqOp.erase(); + } } }; diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEXToStandard.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEXToStandard.cpp deleted file mode 100644 index 4e0de9c09..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEXToStandard.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "AIEXDialect.h" -#include "Passes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Transforms/DialectConversion.h" - -using namespace mlir; -using namespace xilinx; -using namespace xilinx::AIE; -using namespace xilinx::AIEX; - -#define DEBUG_TYPE "amdaiex-standard-lowering" - -template -struct AMDAIEXOpRemoval : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - using OpAdaptor = typename MyAIEXOp::Adaptor; - ModuleOp &module; - - AMDAIEXOpRemoval(MLIRContext *context, ModuleOp &m, - PatternBenefit benefit = 1) - : OpConversionPattern(context, benefit), module(m) {} - - LogicalResult matchAndRewrite( - MyAIEXOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Operation *Op = op.getOperation(); - rewriter.eraseOp(Op); - return success(); - } -}; - -namespace mlir::iree_compiler::AMDAIE { -struct AMDAIEXToStandardPass : mlir::OperationPass { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AMDAIEXToStandardPass) - - AMDAIEXToStandardPass() - : mlir::OperationPass(resolveTypeID()) {} - - llvm::StringRef getArgument() const override { - return "amdaiex-standard-lowering"; - } - - llvm::StringRef getName() const override { return "AMDAIEXToStandardPass"; } - - std::unique_ptr clonePass() const override { - return std::make_unique( - *static_cast(this)); - } - - void getDependentDialects(::mlir::DialectRegistry ®istry) const override { - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - } - - void runOnOperation() override { - ModuleOp m = getOperation(); - ConversionTarget target(getContext()); - RewritePatternSet removepatterns(&getContext()); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - - if (failed(applyPartialConversion(m, target, std::move(removepatterns)))) - signalPassFailure(); - } -}; - -std::unique_ptr> createAMDAIEXToStandardPass() { - return std::make_unique(); -} - -void registerAMDAIEXToStandardPass() { - mlir::registerPass([]() -> std::unique_ptr { - return createAMDAIEXToStandardPass(); - }); -} -} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt index db5c1e449..52244c48a 100644 --- a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt @@ -142,7 +142,6 @@ iree_cc_library( AMDAIELocalizeLocks.cpp AMDAIENormalizeAddressSpaces.cpp AMDAIEObjectFifoStatefulTransform.cpp - AMDAIEXToStandard.cpp DEPS iree-amd-aie::aie_runtime::iree_aie_runtime_static ::AIEDialectIR diff --git a/compiler/plugins/target/AMD-AIE/aie/Passes.h b/compiler/plugins/target/AMD-AIE/aie/Passes.h index 347c32757..bf9e64477 100644 --- a/compiler/plugins/target/AMD-AIE/aie/Passes.h +++ b/compiler/plugins/target/AMD-AIE/aie/Passes.h @@ -34,7 +34,6 @@ createAMDAIEPathfinderPass(); std::unique_ptr> createAMDAIECoreToStandardPass(); std::unique_ptr> createAMDAIEDmaToNpuPass(); -std::unique_ptr> createAMDAIEXToStandardPass(); void registerAMDAIEAssignBufferAddressesBasic(); void registerAMDAIEAssignBufferDescriptorIDs(); @@ -44,9 +43,8 @@ void registerAMDAIELocalizeLocks(); void registerAMDAIENormalizeAddressSpaces(); void 
registerAMDAIEObjectFifoStatefulTransform(); void registerAMDAIERoutePathfinderFlows(); - void registerAMDAIEDmaToNpu(); -void registerAMDAIEXToStandardPass(); + } // namespace mlir::iree_compiler::AMDAIE #endif // AMDAIE_PASSES_H_ diff --git a/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir b/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir index cb82fcd22..adaff90b4 100644 --- a/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir @@ -5,7 +5,7 @@ // CHECK: memref.global "public" @of_fromMem : memref<32xi32> // CHECK: aie.shim_dma_allocation @of_fromMem(MM2S, 0, 0) // CHECK: aie.shim_dma_allocation @of_toMem(S2MM, 0, 0) -// CHECK: } {npu_instructions = dense_resource : tensor<64xui32>} +// CHECK: } {npu_instructions = dense_resource : tensor<64xui32>, runtime_sequence_name = "sequence"} // CHECK: {-# // CHECK: dialect_resources: { diff --git a/compiler/plugins/target/AMD-AIE/aie/test/aiex_standard_lowering.mlir b/compiler/plugins/target/AMD-AIE/aie/test/aiex_standard_lowering.mlir deleted file mode 100644 index 015aea837..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/aiex_standard_lowering.mlir +++ /dev/null @@ -1,22 +0,0 @@ - -// RUN: iree-opt --amdaiex-standard-lowering %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @toMem : memref<16xi32> -// CHECK: func.func @dma_and_wait(%[[ARG0:.*]]: memref<16xi32>, %[[ARG1:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: aie.shim_dma_allocation @toMem(MM2S, 1, 1) -// CHECK: } - -module { - aie.device(npu1_4col) { - memref.global "public" @toMem : memref<16xi32> - func.func @dma_and_wait(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.npu.dma_wait {symbol = @toMem} - return - } - aie.shim_dma_allocation @toMem (MM2S, 1, 1) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir index f4cfd5647..3a78c854c 100644 --- a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir @@ -77,26 +77,3 @@ module { } {sym_name = "explicit_sym_name_0"} } -// ----- - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @toMem : memref<16xi32> -// CHECK: func.func @pretend_microkernel -// CHECK: aiex.runtime_sequence @explicit_sym_name -// CHECK: aie.shim_dma_allocation @toMem(MM2S, 1, 1) - -module { - aie.device(npu1_4col) { - memref.global "public" @toMem : memref<16xi32> - func.func @pretend_microkernel(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - return - } - - aiex.runtime_sequence @explicit_sym_name(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.npu.dma_wait {symbol = @toMem} - } - aie.shim_dma_allocation @toMem (MM2S, 1, 1) - } {sym_name = "wrong_sym_name"} -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp index 2ef29d294..50d72b077 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp @@ -34,7 +34,6 @@ struct AMDAIESession 
AMDAIE::registerAMDAIEObjectFifoStatefulTransform(); AMDAIE::registerAMDAIERoutePathfinderFlows(); AMDAIE::registerAMDAIEDmaToNpu(); - AMDAIE::registerAMDAIEXToStandardPass(); AMDAIE::registerAIRConversionPasses(); AMDAIE::registerAIRTransformPasses(); aievec::registerConvertAIEVecToLLVMPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index 8f764732a..515cc5be1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -16,7 +16,6 @@ #include "aievec/XLLVMDialect.h" #include "air/Dialect/AIR/AIRDialect.h" #include "air/Dialect/AIRRt/AIRRtDialect.h" -#include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" @@ -27,9 +26,12 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/ToolOutputFile.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" -#include "mlir/Conversion/Passes.h" +#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h" +#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/Extensions/AllExtensions.h" @@ -75,42 +77,27 @@ static llvm::cl::opt clEnableAMDAIEUkernels( "unprefixed microkernels to enable, e.g. `matmul`."), llvm::cl::init("none")); -// Utility to find aie.device Op corresponding to the export Op. -// For example, we have -// hal.executable.variant { -// hal.executable.export symbol1 -// hal.executable.export symbol2 -// module { -// aie.device { -// ... -// aiex.runtime_sequence symbol1 -// } -// aie.device { -// ... -// aiex.runtime_sequence symbol2 -// } -// } -// } -// Hence we need to find the aiex.runtime_sequence that coresponds to the export -// op symbol and return its parent aie.device Op. This is what we will pass to -// the `aie2xclbin` tool for artifact generation per entry point. -static xilinx::AIE::DeviceOp getDeviceOpFromEntryPoint(ModuleOp moduleOp, - StringRef exportOpName) { +static xilinx::AIE::DeviceOp getDeviceOpWithName(ModuleOp moduleOp, + StringRef targetName) { xilinx::AIE::DeviceOp deviceOp; - moduleOp.walk([&](xilinx::AIEX::RuntimeSequenceOp sequenceOp) { - if (sequenceOp.getSymName() == exportOpName) { - deviceOp = - dyn_cast_or_null(sequenceOp->getParentOp()); - return WalkResult::interrupt(); - } - return WalkResult::advance(); + uint32_t nDeviceOpsVisited = 0; + moduleOp.walk([&](xilinx::AIE::DeviceOp d) { + ++nDeviceOpsVisited; + // This attribute should've been set in the dma-to-npu pass. 
+ auto maybeName = d->getAttrOfType("runtime_sequence_name"); + if (!maybeName) return WalkResult::advance(); + auto name = maybeName.getValue(); + if (name != targetName) return WalkResult::advance(); + deviceOp = d; + return WalkResult::interrupt(); }); - if (!deviceOp) { - moduleOp.emitError() - << "failed to find aie.device containing func.func with symbol " - << exportOpName; - } + + if (!deviceOp) + moduleOp.emitError() << "visited " << nDeviceOpsVisited + << " aie.device ops, and failed to find one with name " + << targetName; + return deviceOp; } @@ -291,7 +278,7 @@ LogicalResult AIETargetBackend::serializeExecutable( } StringRef exportOpName = exportOp.getSymName(); - deviceOps.push_back(getDeviceOpFromEntryPoint(moduleOp, exportOpName)); + deviceOps.push_back(getDeviceOpWithName(moduleOp, exportOpName)); // The xclbin kernel name, appended with instance name suffix (`:MLIRAIEV1`, // 10 chars) is required by the xclbinutil to have a length smaller or equal @@ -334,21 +321,8 @@ LogicalResult AIETargetBackend::serializeExecutable( uint64_t ordinal = entryPointOrdinals.at(entryPointNames[i]); entryPointNamesFb[ordinal] = entryPointNames[i]; - - SmallString<128> inputMlirPath(workDir); - llvm::sys::path::append(inputMlirPath, - entryPointNamesFb[ordinal] + ".aiecc.mlir"); - std::string errorMessage; - { - auto inputMlirOut = openOutputFile(inputMlirPath, &errorMessage); - if (!inputMlirOut) { - return moduleOp.emitOpError() - << "Failed to write MLIR: " << errorMessage; - } - deviceOps[i].print(inputMlirOut->os(), OpPrintingFlags().useLocalScope()); - inputMlirOut->keep(); - } + // we add the entry point to the working directory for xclbin artifacts if // there are multiple entry points so that we dont overwrite the xclbinutil // generated artifacts e.g kernels.json, for different entry points which @@ -375,11 +349,22 @@ LogicalResult AIETargetBackend::serializeExecutable( ParserConfig pcfg(variantOp->getContext()); llvm::SourceMgr srcMgr; - OwningOpRef owningModuleOp = - parseSourceFile(inputMlirPath, srcMgr, pcfg); + // Move DeviceOp into its own ModuleOp, if there are multiple DeviceOps. + // Required as core-to-standard pass will move all ops in DeviceOps into + // the parent ModuleOp, so if they're not separated, core code between + // DeviceOps gets incorrectly concatenated. There's probably a simpler + // workaround, to be reviewed as we continue to remove layers of crust. 
+ if (deviceOps.size() > 1) { + OpBuilder opBuilder(deviceOps[i].getContext()); + auto moduleWithOneDevice = + opBuilder.create(deviceOps[i].getLoc()); + opBuilder.setInsertionPointToStart(moduleWithOneDevice.getBody()); + Operation *repl = opBuilder.clone(*deviceOps[i].getOperation()); + deviceOps[i] = cast(repl); + } if (failed(aie2xclbin( - /*ctx=*/variantOp->getContext(), /*moduleOp=*/*owningModuleOp, + /*ctx=*/variantOp->getContext(), deviceOps[i], /*outputNPU=*/npuInstPath.str().str(), /*outputXCLBin=*/xclbinPath.str().str(), /*printIRBeforeAll=*/options.aie2xclbinPrintIrBeforeAll, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetBCF.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetBCF.cpp index 48fc13527..33a1567dc 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetBCF.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetBCF.cpp @@ -7,7 +7,6 @@ #include "AMDAIETargets.h" #include "aie/AIEDialect.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/IR/Module.h" using namespace mlir; using namespace xilinx; @@ -17,15 +16,11 @@ std::string utohexstr(uint32_t u) { return "0x" + llvm::utohexstr(u); } namespace mlir::iree_compiler::AMDAIE { -LogicalResult AIETranslateToBCF(ModuleOp module, raw_ostream &output, +LogicalResult AIETranslateToBCF(DeviceOp deviceOp, raw_ostream &output, int tileCol, int tileRow) { DenseMap tiles; DenseMap> buffers; - if (module.getOps().empty()) - module.emitOpError("expected aie.device operation at toplevel"); - DeviceOp deviceOp = *(module.getOps().begin()); - collectTiles(deviceOp, tiles); collectBuffers(deviceOp, buffers); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp index 7678f848a..29216d069 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include -#include // uint +#include #include #include #include @@ -17,12 +17,9 @@ #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "mlir/IR/Block.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/Support/LLVM.h" #include "mlir/Support/LogicalResult.h" @@ -341,14 +338,11 @@ LogicalResult generateCDOBinariesSeparately( return success(); } -LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath, +LogicalResult AIETranslateToCDODirect(xilinx::AIE::DeviceOp device, + llvm::StringRef workDirPath, bool bigEndian, bool emitUnified, bool cdoDebug, bool aieSim, bool enableCores) { - auto devOps = m.getOps(); - assert(llvm::range_size(devOps) == 1 && - "only exactly 1 device op supported."); - DeviceOp device = *devOps.begin(); AMDAIEDeviceModel deviceModel = getDeviceModel(device.getDevice()); byte_ordering endianness = bigEndian ? 
byte_ordering::Big_Endian : byte_ordering::Little_Endian; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp index a106f1e53..5cbebf39e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp @@ -50,14 +50,10 @@ static void writeLDScriptMap(raw_ostream &output, BufferOp buf, int offset) { // .bss : { *(.bss) } > data // } LogicalResult mlir::iree_compiler::AMDAIE::AIETranslateToLdScript( - ModuleOp module, raw_ostream &output, int tileCol, int tileRow) { + DeviceOp deviceOp, raw_ostream &output, int tileCol, int tileRow) { DenseMap tiles; DenseMap> buffers; - if (module.getOps().empty()) { - module.emitOpError("expected AIE.device operation at toplevel"); - } - DeviceOp deviceOp = *(module.getOps().begin()); collectTiles(deviceOp, tiles); ::collectBuffers(deviceOp, buffers); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargets.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargets.h index 5052fadd8..90a16e72a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargets.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargets.h @@ -17,16 +17,16 @@ namespace mlir::iree_compiler::AMDAIE { std::vector AIETranslateToNPU(mlir::ModuleOp); -mlir::LogicalResult AIETranslateToLdScript(mlir::ModuleOp module, +mlir::LogicalResult AIETranslateToLdScript(xilinx::AIE::DeviceOp, llvm::raw_ostream &output, int tileCol, int tileRow); -mlir::LogicalResult AIETranslateToBCF(mlir::ModuleOp module, +mlir::LogicalResult AIETranslateToBCF(xilinx::AIE::DeviceOp, llvm::raw_ostream &output, int tileCol, int tileRow); mlir::LogicalResult AIETranslateToCDODirect( - mlir::ModuleOp m, llvm::StringRef workDirPath, bool bigEndian = false, + xilinx::AIE::DeviceOp, llvm::StringRef workDirPath, bool bigEndian = false, bool emitUnified = false, bool cdoDebug = false, bool aieSim = false, bool enableCores = true); } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index f2bdc6a33..b7c2b1578 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -18,6 +18,8 @@ #include "aievec/Passes.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree/compiler/Utils/ToolUtils.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/JSON.h" @@ -26,6 +28,7 @@ #include "llvm/Support/Program.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/IR/AsmState.h" +#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" #include "mlir/Pass/PassManager.h" #include "mlir/Support/FileUtilities.h" @@ -356,12 +359,12 @@ static std::optional runTool( << "\n"; return {}; } - auto outputFromFile = maybeOutputFromFile.value(); + const std::string &outputFromFile = maybeOutputFromFile.value(); if (verbose) { - auto totalTime = std::chrono::duration_cast>( - stats.TotalTime) - .count(); + float totalTime = std::chrono::duration_cast>( + stats.TotalTime) + .count(); std::string exitStatusStr = result == 0 ? 
"Succeeded" : "Failed"; llvm::outs() << "\n" << exitStatusStr << " in totalTime " << totalTime @@ -432,7 +435,7 @@ static LogicalResult assembleFileUsingPeano( args.emplace_back("--target=aie2-none-unknown-elf"); std::vector peanoArgs = makePeanoOptArgs(); args.reserve(args.size() + peanoArgs.size()); - for (const auto &item : peanoArgs) { + for (const std::string &item : peanoArgs) { args.emplace_back("-mllvm"); args.emplace_back(item); } @@ -498,19 +501,13 @@ static_assert(std::is_same_v vitisDir, const std::string &targetArch, bool verbose, - Path peanoDir, const std::optional &ukernel) { - auto deviceOps = moduleOp.getOps(); - if (!llvm::hasSingleElement(deviceOps)) - return moduleOp.emitOpError("expected a single device op"); - - AIE::DeviceOp deviceOp = *deviceOps.begin(); + AIE::DeviceOp deviceOp, const std::string &objFile, Path tempDir, + bool useChess, std::optional vitisDir, const std::string &targetArch, + bool verbose, Path peanoDir, const std::optional &ukernel) { auto tileOps = deviceOp.getOps(); - std::string errorMessage; - for (auto tileOp : tileOps) { + for (AIE::TileOp tileOp : tileOps) { int col = tileOp.getCol(); int row = tileOp.getRow(); auto coreOp = AIE::getCoreOp(tileOp); @@ -580,7 +577,7 @@ static LogicalResult generateCoreElfFiles( } if (failed(mlir::iree_compiler::AMDAIE::AIETranslateToBCF( - moduleOp, bcfOutput->os(), col, row))) { + deviceOp, bcfOutput->os(), col, row))) { llvm::errs() << "Failed to generate BCF"; return failure(); } @@ -614,7 +611,7 @@ static LogicalResult generateCoreElfFiles( return failure(); } if (failed(mlir::iree_compiler::AMDAIE::AIETranslateToLdScript( - moduleOp, ldscriptOutput->os(), col, row))) { + deviceOp, ldscriptOutput->os(), col, row))) { llvm::errs() << "failed to generate ld script for core (" << col << "," << row << ")"; return failure(); @@ -646,24 +643,28 @@ static LogicalResult generateCoreElfFiles( return success(); } -static LogicalResult generateCDO(MLIRContext *context, ModuleOp moduleOp, +static LogicalResult generateCDO(MLIRContext *context, AIE::DeviceOp deviceOp, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, const Path &tempDir) { - ModuleOp copy = moduleOp.clone(); + + auto copy = cast(deviceOp.getParentOp()->clone()); + deviceOp = *copy.getOps().begin(); + std::string errorMessage; - PassManager passManager(context, ModuleOp::getOperationName()); + PassManager passManager(context, AIE::DeviceOp::getOperationName()); applyConfigToPassManager(passManager, printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing); - passManager.addNestedPass( + passManager.addPass( mlir::iree_compiler::AMDAIE::createAMDAIEPathfinderPass()); - if (failed(passManager.run(copy))) { + + if (failed(passManager.run(deviceOp))) { llvm::errs() << "failed to run passes to prepare for XCLBin generation"; return failure(); } if (failed(mlir::iree_compiler::AMDAIE::AIETranslateToCDODirect( - copy, tempDir.string()))) { + deviceOp, tempDir.string()))) { llvm::errs() << "failed to emit CDO"; return failure(); } @@ -1029,17 +1030,22 @@ struct RemoveAlignment2FromLLVMLoadPass } // namespace static LogicalResult generateUnifiedObject( - MLIRContext *context, ModuleOp moduleOp, const std::string &outputFile, + MLIRContext *context, AIE::DeviceOp deviceOp, const std::string &outputFile, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, bool useChess, bool verbose, Path tempDir, std::optional vitisDir, const std::string &targetArch, Path peanoDir) { - PassManager pm(context, 
moduleOp.getOperationName()); + assert(deviceOp->getParentOp() && isa(deviceOp->getParentOp()) && + "DeviceOp must be in a module parent"); + + ModuleOp moduleOpCopy = cast(deviceOp->getParentOp()).clone(); + + PassManager pm(context, moduleOpCopy.getOperationName()); applyConfigToPassManager(pm, printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing); pm.addPass(mlir::iree_compiler::AMDAIE::createAMDAIECoreToStandardPass()); - pm.addPass(mlir::iree_compiler::AMDAIE::createAMDAIEXToStandardPass()); + // Convert specific vector dialect ops (like vector.contract) to the AIEVec // dialect mlir::iree_compiler::aievec::buildConvertVectorToAIEVec(pm); @@ -1052,14 +1058,15 @@ static LogicalResult generateUnifiedObject( llvm::outs() << "\n"; } - ModuleOp copy = moduleOp.clone(); - if (failed(pm.run(copy))) - return moduleOp.emitOpError("Failed to lower to LLVM"); + if (failed(pm.run(moduleOpCopy))) + return deviceOp.emitOpError("Failed to lower to LLVM"); llvm::LLVMContext llvmContext; - auto llvmModule = translateModuleToLLVMIR(copy, llvmContext); - if (!llvmModule) - return moduleOp.emitOpError("Failed to translate module to LLVMIR"); + auto llvmModule = translateModuleToLLVMIR(moduleOpCopy, llvmContext); + if (!llvmModule) { + return deviceOp.emitOpError("Failed to translate module to LLVMIR"); + } + std::string inputLLStr; { llvm::raw_string_ostream rso(inputLLStr); @@ -1081,7 +1088,9 @@ static LogicalResult generateUnifiedObject( /*workDir=*/tempDir, /*vitisDir=*/*maybeVitisDir, /*verbose=*/verbose); - if (failed(chessIntrinsicsObjFile)) return failure(); + if (failed(chessIntrinsicsObjFile)) { + return failure(); + } } else { Path LLVMIRFile = tempDir / "input.ll"; if (auto maybeErr = dumpStrToDisk(inputLLStr, LLVMIRFile.string()); @@ -1116,12 +1125,37 @@ static LogicalResult generateUnifiedObject( return failure(); } } - copy->erase(); + + moduleOpCopy->erase(); return success(); } +FailureOr> getNpuInstructions(AIE::DeviceOp deviceOp) { + MLIRContext *ctx = deviceOp.getContext(); + mlir::Attribute maybeNpuInstructions = deviceOp->getAttr("npu_instructions"); + if (!maybeNpuInstructions) { + return emitError(UnknownLoc::get(ctx), + "Expected npu_instructions attribute on aie.device"); + } + auto npuInstructions = + dyn_cast(maybeNpuInstructions); + if (!npuInstructions) { + return emitError( + UnknownLoc::get(ctx), + "Failed to cast npu_instructions to DenseUI32ResourceElementsAttr"); + } + std::optional> maybeArrayRef = + npuInstructions.tryGetAsArrayRef(); + if (!maybeArrayRef.has_value()) { + return emitError( + UnknownLoc::get(ctx), + "Failed getting values for npu_instructions in tryGetAsArrayRef"); + } + return maybeArrayRef.value(); +} + LogicalResult aie2xclbin( - MLIRContext *ctx, ModuleOp moduleOp, const std::string &outputNPU, + MLIRContext *ctx, AIE::DeviceOp deviceOp, const std::string &outputNPU, const std::string &outputXCLBin, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, const std::string &tempDir, bool useChess, bool verbose, @@ -1131,22 +1165,19 @@ LogicalResult aie2xclbin( const std::string &amdAIEInstallDir, const std::optional &InputXCLBin, const std::optional &ukernel) { - PassManager pm(ctx, mlir::ModuleOp::getOperationName()); + PassManager pm(ctx, AIE::DeviceOp::getOperationName()); applyConfigToPassManager(pm, printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing); - // generateNPUInstructions - pm.addNestedPass( - mlir::iree_compiler::AMDAIE::createAMDAIEDmaToNpuPass()); - if (failed(pm.run(moduleOp))) - return 
moduleOp.emitOpError(": NPU Instruction pipeline failed"); - - std::optional> npuInstructions = - cast( - (*moduleOp.getOps().begin()) - ->getAttr("npu_instructions")) - .tryGetAsArrayRef(); - if (!npuInstructions) - return moduleOp.emitOpError(": No NPU instructions in device op"); + if (failed(pm.run(deviceOp))) + return deviceOp.emitOpError(": NPU Instruction pipeline failed"); + + FailureOr> maybeNpuInstructions = + getNpuInstructions(deviceOp); + if (failed(maybeNpuInstructions)) { + assert(false && "Failed to get NPU instructions"); + return failure(); + } + ArrayRef npuInstructions = maybeNpuInstructions.value(); std::string errorMessage; auto output = openOutputFile(outputNPU, &errorMessage); @@ -1155,29 +1186,29 @@ LogicalResult aie2xclbin( << errorMessage; return failure(); } - for (auto w : *npuInstructions) output->os() << llvm::format("%08X\n", w); + for (uint32_t w : npuInstructions) output->os() << llvm::format("%08X\n", w); output->keep(); Path unifiedObj = Path(tempDir) / "input.o"; if (failed(generateUnifiedObject( - ctx, moduleOp, unifiedObj.string(), printIRBeforeAll, printIRAfterAll, + ctx, deviceOp, unifiedObj.string(), printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing, useChess, verbose, tempDir, vitisDir, targetArch, peanoDir))) - return moduleOp.emitOpError("Failed to generate unified object"); + return deviceOp.emitOpError("Failed to generate unified object"); - if (failed(generateCoreElfFiles(moduleOp, unifiedObj.string(), tempDir, + if (failed(generateCoreElfFiles(deviceOp, unifiedObj.string(), tempDir, useChess, vitisDir, targetArch, verbose, peanoDir, ukernel))) - return moduleOp.emitOpError("Failed to generate core ELF file(s)"); + return deviceOp.emitOpError("Failed to generate core ELF file(s)"); - if (failed(generateCDO(ctx, moduleOp, printIRBeforeAll, printIRAfterAll, + if (failed(generateCDO(ctx, deviceOp, printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing, tempDir))) - return moduleOp.emitOpError("Failed to generate CDO"); + return deviceOp.emitOpError("Failed to generate CDO"); if (failed(generateXCLBin(outputXCLBin, tempDir, xclBinKernelID, xclBinKernelName, xclBinInstanceName, amdAIEInstallDir, verbose, InputXCLBin))) - return moduleOp.emitOpError("Failed to generate XCLBin"); + return deviceOp.emitOpError("Failed to generate XCLBin"); return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h index 705e97d4f..290064170 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h @@ -7,17 +7,15 @@ #include -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringRef.h" -#include "mlir/IR/BuiltinOps.h" +#include "aie/AIEDialect.h" #include "mlir/IR/MLIRContext.h" #include "mlir/Support/LogicalResult.h" mlir::LogicalResult aie2xclbin( - mlir::MLIRContext *ctx, mlir::ModuleOp moduleOp, - const std::string &outputNPU, const std::string &outputXCLBin, - bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, - bool timing, const std::string &tempDir, bool useChess, bool verbose, + mlir::MLIRContext *ctx, xilinx::AIE::DeviceOp, const std::string &outputNPU, + const std::string &outputXCLBin, bool printIRBeforeAll, + bool printIRAfterAll, bool printIRModuleScope, bool timing, + const std::string &tempDir, bool useChess, bool verbose, const std::optional &vitisDir, const std::string &targetArch, const std::string &peanoDir, const 
std::string &xclBinKernelID, const std::string &xclBinKernelName, const std::string &xclBinInstanceName, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/aie_cdo_gen_test.cxx b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/aie_cdo_gen_test.cxx index fcc0d39d7..7ea4b8269 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/aie_cdo_gen_test.cxx +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/aie_cdo_gen_test.cxx @@ -10,10 +10,7 @@ #include "aie/AIEDialect.h" #include "aie/AIEXDialect.h" #include "iree-amd-aie/Target/AMDAIETargets.h" -#include "iree-amd-aie/Target/XCLBinGen.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Path.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Parser/Parser.h" @@ -43,11 +40,19 @@ int main(int argc, char **argv) { mlir::ParserConfig parserConfig(&context); auto moduleOp = llvm::cast( mlir::parseSourceFile(mlirAbsPath, parserConfig).release()); + + auto deviceOps = moduleOp.getOps(); + auto nDeviceOps = std::distance(deviceOps.begin(), deviceOps.end()); + if (nDeviceOps != 1){ + std::cerr << "Error: Expected exactly one xilinx.aie.device op\n"; + return 1; + } + auto deviceOp = *deviceOps.begin(); llvm::DebugFlag = true; const char *debugTypes[3] = {"aie-generate-cdo", "iree-aie-runtime", "iree-aie-cdo-emitter"}; llvm::setCurrentDebugTypes(debugTypes, 3); - auto status = AIETranslateToCDODirect(moduleOp, workDir, false, false, false); + auto status = AIETranslateToCDODirect(deviceOp, workDir, false, false, false); std::vector diagnostics; ScopedDiagnosticHandler handler(moduleOp.getContext(), [&](Diagnostic &d) { llvm::raw_string_ostream(diagnostics.emplace_back()) @@ -59,7 +64,7 @@ int main(int argc, char **argv) { llvm::DebugFlag = false; llvm::setCurrentDebugType("aie-cdo-driver-debug"); - status = AIETranslateToCDODirect(moduleOp, workDir, false, false, true); + status = AIETranslateToCDODirect(deviceOp, workDir, false, false, true); if (failed(status)) for (const auto &diagnostic : diagnostics) std::cerr << diagnostic << "\n"; } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index b4a0e502d..a1bd27d91 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -621,8 +621,6 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createAMDAIELowerToAIEPass()); passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createConvertLinalgToLoopsPass()); - // Now lower using the AIE passes from MLIR-AIE. 
addMLIRAIELoweringPasses(passManager); } @@ -788,18 +786,28 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { } void addMLIRAIELoweringPasses(OpPassManager &passManager) { + { + OpPassManager &devicePM = passManager.nest(); + devicePM.addPass(createAMDAIEObjectFifoStatefulTransformPass()); + devicePM.addPass(createCanonicalizerPass()); + devicePM.addPass(createAMDAIEDmaToNpuPass()); + devicePM.addPass(createAMDAIEAssignLockIDsPass()); + devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass()); + devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass()); + devicePM.addPass(createAMDAIEPathfinderPass()); + } + + passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createConvertLinalgToLoopsPass()); passManager.addPass(createLowerAffinePass()); - OpPassManager &devicePM = passManager.nest(); - devicePM.addPass(createAMDAIEAssignLockIDsPass()); - devicePM.addPass(createAMDAIEObjectFifoStatefulTransformPass()); - devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass()); - devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass()); - devicePM.addPass(createAMDAIEPathfinderPass()); passManager.addPass(createConvertSCFToCFPass()); - passManager.addNestedPass( - createAMDAIELocalizeLocksPass()); - passManager.addNestedPass( - createAMDAIENormalizeAddressSpacesPass()); + + { + OpPassManager &devicePM = passManager.nest(); + devicePM.addPass(createAMDAIELocalizeLocksPass()); + devicePM.addPass(createAMDAIENormalizeAddressSpacesPass()); + devicePM.addPass(createCanonicalizerPass()); + } } // NOTE: this runs on the top-level program module containing all hal.executable diff --git a/tests/samples/conv_pipeline_e2e.mlir b/tests/samples/conv_pipeline_e2e.mlir index 71b1442b8..7c6957017 100644 --- a/tests/samples/conv_pipeline_e2e.mlir +++ b/tests/samples/conv_pipeline_e2e.mlir @@ -13,11 +13,6 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32 // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @conv_2d_nhwc_hwcf_dispatch_0_conv_2d_nhwc_hwcf_2x12x12x64x3x3x32_i32(%arg0: memref<2x14x14x32xi32>, %arg1: memref<3x3x32x64xi32>, %arg2: memref<2x12x12x64xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync // ----- @@ -34,8 +29,3 @@ func.func @conv_2d_nhwc_hwcf_q(%arg0: tensor<2x14x14x32xi8>, %arg1: tensor<3x3x3 // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @conv_2d_nhwc_hwcf_q_dispatch_0_conv_2d_nhwc_hwcf_q_2x12x12x64x3x3x32_i8xi8xi32xi32xi32(%arg0: memref<3136xi32>, %arg1: memref<4608xi32>, %arg2: memref<2x12x12x64xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync diff --git a/tests/samples/matmul_peeled_objectfifo_e2e.mlir b/tests/samples/matmul_peeled_objectfifo_e2e.mlir index 386214f58..484494045 100644 --- a/tests/samples/matmul_peeled_objectfifo_e2e.mlir +++ b/tests/samples/matmul_peeled_objectfifo_e2e.mlir @@ -1,6 +1,8 @@ // RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --iree-amdaie-target-device=npu1_4col %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=objectFifo --iree-amdaie-tile-pipeline=pack-peel 
--split-input-file | FileCheck %s // CHECK-LABEL: hal.executable.export public @matmul_i32_dispatch_0_matmul_128x128x256_i32 + +// CHECK: aie.device(npu1_4col) { // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) // CHECK-DAG: %[[TILE_0_3:.+]] = aie.tile(0, 3) // CHECK-DAG: %[[TILE_1_2:.+]] = aie.tile(1, 2) @@ -11,21 +13,16 @@ // CHECK-DAG: aie.core(%[[TILE_1_2]]) // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) -// CHECK-DAG: aiex.runtime_sequence @matmul_i32_dispatch_0_matmul_128x128x256_i32(%[[ARG0:.+]]: memref<128x256xi32>, %[[ARG1:.+]]: memref<256x128xi32>, %[[ARG2:.+]]: memref<128x128xi32>) -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][2, 2, 64, 64][8192, 64, 128, 1]) {id = 0 : i64, issue_token = true, metadata = @[[OBJ10:.+]]} -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 8, 64, 32][0, 32, 256, 1]) {id = 1 : i64, issue_token = true, metadata = @[[OBJ0:.+]]} -// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ0]]} -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][8, 2, 32, 32][4096, 32, 128, 1]) {id = 2 : i64, issue_token = true, metadata = @[[OBJ1:.+]]} -// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ1]]} -// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ10]]} -// CHECK-DAG: aie.shim_dma_allocation @[[OBJ0]](MM2S, 0, 0) -// CHECK-DAG: aie.shim_dma_allocation @[[OBJ1]](MM2S, 1, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) // CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]]) // CHECK-DAG: aie.mem(%[[TILE_0_2]]) // CHECK-DAG: aie.mem(%[[TILE_0_3]]) // CHECK-DAG: aie.mem(%[[TILE_1_2]]) // CHECK-DAG: aie.mem(%[[TILE_1_3]]) -// CHECK-DAG: aie.shim_dma_allocation @[[OBJ10]](S2MM, 0, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(S2MM, 0, 0) +// CHECK: {npu_instructions = +// CHECK-SAME: runtime_sequence_name = "matmul_i32_dispatch_0_matmul_128x128x256_i32" func.func @matmul_i32(%lhs: tensor<128x256xi32>, %rhs: tensor<256x128xi32>) -> tensor<128x128xi32> { %cst = arith.constant 0 : i32 diff --git a/tests/samples/pack_peel_pipeline_matmul.mlir b/tests/samples/pack_peel_pipeline_matmul.mlir index 344c34e5d..a626a2132 100644 --- a/tests/samples/pack_peel_pipeline_matmul.mlir +++ b/tests/samples/pack_peel_pipeline_matmul.mlir @@ -15,11 +15,6 @@ func.func @matmul_i8_i32(%lhs: tensor<32x16xi8>, %rhs: tensor<16x32xi8>) -> tens // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_i8_i32_dispatch_0_matmul_32x32x16_i8xi8xi32(%arg0: memref<128xi32>, %arg1: memref<128xi32>, %arg2: memref<32x32xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync // ----- @@ -38,8 +33,3 @@ func.func @matmul_bf16(%lhs: tensor<16x32xbf16>, %rhs: tensor<32x16xbf16>) -> te // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_bf16_dispatch_0_matmul_16x16x32_bf16(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<128xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync diff --git a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir index 9c2cbf935..c99b3b269 100644 --- a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir +++ 
b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir @@ -20,15 +20,8 @@ func.func @matmul_elementwise_i32(%lhs: tensor<1024x512xi32>, %rhs: tensor<512x1 } // CHECK-LABEL: hal.executable.export public @matmul_elementwise_i32_dispatch_0_matmul_1024x1024x512_i32 -// CHECK: aie.device(npu1_4col) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_elementwise_i32_dispatch_0_matmul_1024x1024x512_i32(%arg0: memref<1024x512xi32>, %arg1: memref<512x1024xi32>, %arg2: memref<1024x1024xi32>, %arg3: memref<1024x1024xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync +// CHECK: aie.device(npu1_4col) +// CHECK-COUNT-3: aie.shim_dma_allocation // ----- @@ -52,15 +45,8 @@ func.func @matmul_elementwise_bf16_f32(%arg0: tensor<1024x512xbf16>, %arg1: tens } // CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_f32_dispatch_0_matmul_1024x1024x512_bf16xbf16xf32 -// CHECK: aie.device(npu1_4col) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_elementwise_bf16_f32_dispatch_0_matmul_1024x1024x512_bf16xbf16xf32(%arg0: memref<262144xi32>, %arg1: memref<262144xi32>, %arg2: memref<1024xf32>, %arg3: memref<1024x1024xf32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync +// CHECK: aie.device(npu1_4col) +// CHECK-COUNT-3: aie.shim_dma_allocation // ----- func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x16384xbf16>, %arg2: tensor<512xf32>) -> tensor<512x16384xbf16> { @@ -78,12 +64,6 @@ func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<51 return %11 : tensor<512x16384xbf16> } -// CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32 -// CHECK: aie.device(npu1_4col) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32(%arg0: memref<131072xi32>, %arg1: memref<4194304xi32>, %arg2: memref<512xf32>, %arg3: memref<4194304xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd +// CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32 +// CHECK: aie.device(npu1_4col) +// CHECK-COUNT-3: aie.shim_dma_allocation diff --git a/tests/samples/pad_pack_pipeline_e2e.mlir b/tests/samples/pad_pack_pipeline_e2e.mlir index 18d9d8708..14bdcb04c 100644 --- a/tests/samples/pad_pack_pipeline_e2e.mlir +++ b/tests/samples/pad_pack_pipeline_e2e.mlir @@ -7,11 +7,6 @@ // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @matmul_small_dispatch_0_matmul_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<16x32xi32>, %arg2: memref<8x32xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync func.func @matmul_small(%lhs : tensor<8x16xi32>, %rhs : tensor<16x32xi32>) -> tensor<8x32xi32> { %empty = tensor.empty() : tensor<8x32xi32> @@ -29,12 +24,6 @@ func.func @matmul_small(%lhs : tensor<8x16xi32>, // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: 
aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @matmul_large_dispatch_0_matmul_2048x2048x2048_i32(%arg0: memref<2048x2048xi32>, %arg1: memref<2048x2048xi32>, %arg2: memref<2048x2048xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync - func.func @matmul_large(%lhs: tensor<2048x2048xi32>, %rhs: tensor<2048x2048xi32>) -> tensor<2048x2048xi32> { %empty = tensor.empty() : tensor<2048x2048xi32> %cst = arith.constant 0 : i32 @@ -54,11 +43,6 @@ func.func @matmul_large(%lhs: tensor<2048x2048xi32>, %rhs: tensor<2048x2048xi32> // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @generic_matmul_transpose_static_dispatch_0_matmul_like_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<32x16xi32>, %arg2: memref<8x32xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync func.func @generic_matmul_transpose_static(%lhs : tensor<8x16xi32>, %rhs : tensor<32x16xi32>) -> tensor<8x32xi32> { %cst = arith.constant 0 : i32 @@ -82,11 +66,6 @@ func.func @generic_matmul_transpose_static(%lhs : tensor<8x16xi32>, // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @matmul_transpose_b_static_dispatch_0_matmul_transpose_b_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<32x16xi32>, %arg2: memref<8x32xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync func.func @matmul_transpose_b_static(%lhs : tensor<8x16xi32>, %rhs : tensor<32x16xi32>) -> tensor<8x32xi32> { %cst = arith.constant 0 : i32
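
For illustration, here is a minimal, hypothetical sketch (not output from a real compilation) of the shape of an `aie.device` op after the dma-to-npu pass with this change: the `aiex.runtime_sequence` body has been translated into the `npu_instructions` dense resource attribute, and its symbol name is recorded in `runtime_sequence_name`, which `getDeviceOpWithName` in AIETarget.cpp later uses to match a device op to its `hal.executable.export`. The op names are taken from the tests in this patch; the attribute values are placeholders.

```mlir
// Hypothetical post dma-to-npu IR: the runtime sequence op itself is gone;
// only its instructions (a blob of ui32 words) and its name remain on the device.
aie.device(npu1_4col) {
  memref.global "public" @of_toMem : memref<16xi32>
  aie.shim_dma_allocation @of_toMem(S2MM, 0, 0)
} {npu_instructions = dense_resource<npu_instructions> : tensor<64xui32>,
   runtime_sequence_name = "sequence"}
```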