diff --git a/CMakeLists.txt b/CMakeLists.txt index a34af1e3a2..c977f4056b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,6 +180,8 @@ find_library(ELF_LIB elf) cmake_dependent_option(AIE_ENABLE_AIRBIN "Enables emitting AIRBIN ELF binaries." OFF "ELF_LIB" OFF) +# If we need runtime libs, then statically link them. +set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") add_flag_if_supported("-Werror=sign-compare" WERROR_SIGN_COMPARE) add_flag_if_supported("-Werror=unused" WERROR_USED) diff --git a/aie_runtime_lib/AIE/aiesim/CMakeLists.txt b/aie_runtime_lib/AIE/aiesim/CMakeLists.txt index b62f624bad..ac39f4e417 100644 --- a/aie_runtime_lib/AIE/aiesim/CMakeLists.txt +++ b/aie_runtime_lib/AIE/aiesim/CMakeLists.txt @@ -7,7 +7,6 @@ set(INSTALLS - Makefile genwrapper_for_ps.cpp) diff --git a/aie_runtime_lib/AIE/aiesim/Makefile b/aie_runtime_lib/AIE/aiesim/Makefile deleted file mode 100644 index 722af0b6b6..0000000000 --- a/aie_runtime_lib/AIE/aiesim/Makefile +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. -# # SPDX-License-Identifier: MIT - -# -# From one diretory up, you can invoke the simulator by typing -# > make -C sim -# -# If you want to change the host source file, you can redefine host: -# > make -C sim host=../yourhost.cpp -# -# Note: The host file location is relative to the folder or can -# be an absolute path -# - -MLIR_AIE_INSTALL = $(dir $(shell which aie-opt))/.. - -ifeq ($(host),) -host:=../../test.cpp -endif -ifeq ($(MLIR_AIE_SRC_DIR),) -MLIR_AIE_SRC_DIR:=. -endif -ifeq ($(XILINX_VITIS_AIETOOLS),) -XILINX_VITIS_AIETOOLS = $(dir $(shell which aiesimulator))/.. 
-endif - -MLIR_AIE_PROJ = $(notdir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))))) - -.PHONY: all link sim clean -.NOTPARALLEL: -all: sim - -CC_ENV := (export LD_LIBRARY_PATH=${XILINX_VITIS_AIETOOLS}/lib/lnx64.o:$(LD_LIBRARY_PATH)) -CC := "${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/bin/g++" -CC_ARGS := -fPIC -fpermissive -c -std=c++17 -D__AIEARCH__=10 -DAIE_OPTION_SCALAR_FLOAT_ON_VECTOR -Wno-deprecated-declarations -DSC_INCLUDE_DYNAMIC_PROCESSES -D__AIESIM__ -D__PS_INIT_AIE__ -DXAIE_DEBUG -Og -flto -D main\(...\)=ps_main\(...\) -I${XILINX_VITIS_AIETOOLS}/include -I${XILINX_VITIS_AIETOOLS}/include/drivers/aiengine -I${XILINX_HLS}/include -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0 -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0/backward -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0/x86_64-pc-linux-gnu -I${XILINX_VITIS_AIETOOLS}/data/osci_systemc/include -I. -I$(MLIR_AIE_SRC_DIR) -I${XILINX_VITIS_AIETOOLS}/include/xtlm/include -I${XILINX_VITIS_AIETOOLS}/include/common_cpp/common_cpp_v1_0/include -I${MLIR_AIE_INSTALL}/runtime_lib/x86_64/test_lib/include -I../../ -I../ - -ps/test.o: $(host) - $(CC_ENV);$(CC) $(CC_ARGS) -o $@ $< - -ps/test_library.o: ${MLIR_AIE_INSTALL}/../runtime_lib/test_lib/test_library.cpp - $(CC_ENV);$(CC) $(CC_ARGS) -o $@ $< - -ps/genwrapper_for_ps.o: ps/genwrapper_for_ps.cpp - $(CC_ENV);$(CC) $(CC_ARGS) -o $@ $< - -ps/ps.so: ps/genwrapper_for_ps.o ps/test.o ps/test_library.o $(eval PATH:=$(XILINX_VITIS_AIETOOLS)/tps/lnx64/gcc/bin/:$(PATH)) - (${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/bin/g++ -o "ps/ps.so" ps/genwrapper_for_ps.o ps/test.o ps/test_library.o -Wl,--as-needed -shared -lxaiengine -lxioutils -ladf_api -lsystemc -lxtlm -flto -L ${XILINX_VITIS_AIETOOLS}/lib/lnx64.o -L${XILINX_VITIS_AIETOOLS}/data/osci_systemc/lib/lnx64) - -link: ps/ps.so - -sim: ps/ps.so - cd ../..; aiesimulator --pkg-dir=./$(MLIR_AIE_PROJ)/sim --dump-vcd foo - -clean: - (rm -rf ps/*.o ps/*.so *.log 
aiesimulator_output *vcd) diff --git a/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt b/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt index f47215010e..977fdccc7c 100644 --- a/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt +++ b/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt @@ -7,7 +7,6 @@ set(INSTALLS - Makefile genwrapper_for_ps.cpp) diff --git a/aie_runtime_lib/AIE2/aiesim/Makefile b/aie_runtime_lib/AIE2/aiesim/Makefile deleted file mode 100644 index 1e7c476b7a..0000000000 --- a/aie_runtime_lib/AIE2/aiesim/Makefile +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. -# # SPDX-License-Identifier: MIT - -# -# From one diretory up, you can invoke the simulator by typing -# > make -C sim -# -# If you want to change the host source file, you can redefine host: -# > make -C sim host=../yourhost.cpp -# -# Note: The host file location is relative to the folder or can -# be an absolute path -# - -MLIR_AIE_INSTALL = $(dir $(shell which aie-opt))/.. - -ifeq ($(host),) -host:=../../test.cpp -endif -ifeq ($(MLIR_AIE_SRC_DIR),) -MLIR_AIE_SRC_DIR:=. -endif -ifeq ($(XILINX_VITIS_AIETOOLS),) -XILINX_VITIS_AIETOOLS = $(dir $(shell which aiesimulator))/.. 
-endif - -MLIR_AIE_PROJ = $(notdir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))))) - -.PHONY: all link sim clean -.NOTPARALLEL: -all: sim - -CC_ENV := (export LD_LIBRARY_PATH=${XILINX_VITIS_AIETOOLS}/lib/lnx64.o:$(LD_LIBRARY_PATH)) -CC := "${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/bin/g++" -CC_ARGS := -fPIC -fpermissive -c -std=c++17 -D__AIEARCH__=20 -DAIE_OPTION_SCALAR_FLOAT_ON_VECTOR -DAIE2_FP32_EMULATION_ACCURACY_FAST -Wno-deprecated-declarations -DSC_INCLUDE_DYNAMIC_PROCESSES -D__AIESIM__ -D__PS_INIT_AIE__ -DXAIE_DEBUG -Og -flto -D main\(...\)=ps_main\(...\) -I${XILINX_VITIS_AIETOOLS}/include -I${XILINX_VITIS_AIETOOLS}/include/drivers/aiengine -I${XILINX_HLS}/include -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0 -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0/backward -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0/x86_64-pc-linux-gnu -I${XILINX_VITIS_AIETOOLS}/data/osci_systemc/include -I. -I$(MLIR_AIE_SRC_DIR) -I${XILINX_VITIS_AIETOOLS}/include/xtlm/include -I${XILINX_VITIS_AIETOOLS}/include/common_cpp/common_cpp_v1_0/include -I${MLIR_AIE_INSTALL}/runtime_lib/x86_64/test_lib/include -I../../ -I../ - -ps/test.o: $(host) - $(CC_ENV);$(CC) $(CC_ARGS) -o $@ $< - -ps/test_library.o: ${MLIR_AIE_INSTALL}/../runtime_lib/test_lib/test_library.cpp - $(CC_ENV);$(CC) $(CC_ARGS) -o $@ $< - -ps/genwrapper_for_ps.o: ps/genwrapper_for_ps.cpp - $(CC_ENV);$(CC) $(CC_ARGS) -o $@ $< - -ps/ps.so: ps/genwrapper_for_ps.o ps/test.o ps/test_library.o $(eval PATH:=$(XILINX_VITIS_AIETOOLS)/tps/lnx64/gcc/bin/:$(PATH)) - (${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/bin/g++ -o "ps/ps.so" ps/genwrapper_for_ps.o ps/test.o ps/test_library.o -Wl,--as-needed -shared -lxaiengine -lxioutils -ladf_api -lsystemc -lxtlm -flto -L ${XILINX_VITIS_AIETOOLS}/lib/lnx64.o -L${XILINX_VITIS_AIETOOLS}/data/osci_systemc/lib/lnx64) - -link: ps/ps.so - -sim: ps/ps.so - cd ../..; aiesimulator --pkg-dir=./$(MLIR_AIE_PROJ)/sim --dump-vcd foo - 
-clean: - (rm -rf ps/*.o ps/*.so *.log aiesimulator_output *vcd) diff --git a/cmake/toolchainFiles/toolchain_x86_64.cmake b/cmake/toolchainFiles/toolchain_x86_64.cmake index 6961a860f6..b0fe79758f 100644 --- a/cmake/toolchainFiles/toolchain_x86_64.cmake +++ b/cmake/toolchainFiles/toolchain_x86_64.cmake @@ -1,4 +1,7 @@ -# Copyright (C) 2018-2022, Xilinx Inc. All rights reserved. -# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. -# SPDX-License-Identifier: MIT +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2018-2024 Advanced Micro Devices, Inc. or its affiliates diff --git a/docs/Building.md b/docs/Building.md index c32ffc09e8..07b237eb09 100644 --- a/docs/Building.md +++ b/docs/Building.md @@ -118,6 +118,11 @@ and llvm. source utils/env_setup.sh /install /install ``` +Note that when coming back to this install with a fresh environment, it is necessary to rerun the `utils/env_setup.sh` script to set up your environment as well as activate the Python virtual environment using the following command. +``` +source sandbox/bin/activate +``` + ## Building on X86 targetting the VCK5000 In order to build and run on PCIe cards, you first have to build and install the aie-rt library. We chose to install the library in /opt/xaiengine but it is not required for the tools to be installed there. Just ensure that when building mlir-aie and mlir-air, that you point to the directory in which the aie-rt library was installed. @@ -147,6 +152,8 @@ Then, set `${ROCM_ROOT}` to the ROCm install from the previous path. Then, run t The PCIe AIR runtime requires the use of the [AIR PCIe kernel driver](https://github.com/Xilinx/ROCm-air-platforms/tree/main/driver). 
The driver directory in the [ROCm-air-platforms](https://github.com/Xilinx/ROCm-air-platforms) repository contains documentation on how to compile and load the AIR PCIe kernel driver. +After this is complete, refer back to Step 5 of `Building on X86 for mlir-aie development` to set up the rest of your environment. + ### Sysroot Since the AIE tools are cross-compiling, in order to actually compile code, we need a 'sysroot' directory, containing an ARM rootfs. This rootfs must match what will be available in the runtime environment. diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 806275a8e9..00a9a89137 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -13,7 +13,7 @@ #include "aie/Dialect/AIE/IR/AIEDialect.h" #include "aie/Dialect/AIE/Transforms/AIEPasses.h" -#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Analysis/TopologicalSortUtils.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" diff --git a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp index 42cf201824..0ab77bcde7 100644 --- a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp +++ b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2023, Advanced Micro Devices, Inc. +// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. 
// //===----------------------------------------------------------------------===// // This file contains conversions and rewrites to the Vector dialect to make @@ -39,6 +39,55 @@ using namespace xilinx::aievec; //================== Common AIE canonicalization analysis ====================// //============================================================================// +static bool isGemmBTransposedContractionOp(vector::ContractionOp op) { + if (op.getKind() != vector::CombiningKind::ADD) + return false; + + // Get and check shape of operands + auto lhsShape = op.getLhsType().getShape(); + auto rhsShape = op.getRhsType().getShape(); + auto accShape = cast(op.getAccType()).getShape(); + if (lhsShape.size() < 2 || rhsShape.size() < 2 || accShape.size() < 2) + return false; + + // Check that the innermost iterators match gemm-like iterators + SmallVector iterators = op.getIteratorTypesArray(); + if (iterators.size() < 3) + return false; + auto innerMostIterators = + SmallVector(iterators.end() - 3, iterators.end()); + if (vector::IteratorType::parallel != innerMostIterators[0] || + vector::IteratorType::parallel != innerMostIterators[1] || + vector::IteratorType::reduction != innerMostIterators[2]) + return false; + + // Get indexing maps of iterators for operands + SmallVector indexingMaps(op.getIndexingMapsArray()); + SmallVector outerMostResults; + for (int64_t i = 0; i < indexingMaps[0].getNumResults() - 2; i++) + outerMostResults.push_back(i); + + auto innerLhsMap = indexingMaps[0].dropResults(outerMostResults); + auto innerRhsMap = indexingMaps[1].dropResults(outerMostResults); + auto innerAccMap = indexingMaps[2].dropResults(outerMostResults); + + // Check whether they conform to a "transposed B" gemm + auto ctx = op.getContext(); + auto mmAidxMap = + AffineMap::getPermutationMap(ArrayRef{1, 0, 2}, ctx) + .dropResults(0); + auto mmBidxMap = + AffineMap::getPermutationMap(ArrayRef{0, 1, 2}, ctx) + .dropResults(0); + auto mmCidxMap = + 
AffineMap::getPermutationMap(ArrayRef{2, 0, 1}, ctx) + .dropResults(0); + int64_t numOuterMostDims = indexingMaps[0].getNumDims() - 3; + return innerLhsMap == mmAidxMap.shiftDims(numOuterMostDims) && + innerRhsMap == mmBidxMap.shiftDims(numOuterMostDims) && + innerAccMap == mmCidxMap.shiftDims(numOuterMostDims); +} + //============================================================================// //============ Common AIE canonicalization conversion patterns ===============// //============================================================================// @@ -411,6 +460,107 @@ struct FlattenMultDimTransferWritePattern } }; +// This pattern takes out an implicit transposition of the `rhs` operand in a +// gemm-like contraction op, making it an explicit `vector.transpose` op. +// If `rhs` is coming from a widening op (`extf`/`extsi`/`extui`), the +// transposition will be hoisted above the widening op. +struct ExtractTransposeFromContractionOp + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + static VectorType getTransposedVectorType(VectorType vecTy) { + SmallVector shape{vecTy.getShape()}; + auto nDim = shape.size(); + int64_t dimNm1 = shape[nDim - 1]; + shape[nDim - 1] = shape[nDim - 2]; + shape[nDim - 2] = dimNm1; + auto elemTy = vecTy.getElementType(); + return VectorType::get(shape, elemTy); + } + + LogicalResult + matchAndRewrite(vector::ContractionOp contractOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!isGemmBTransposedContractionOp(contractOp)) + return failure(); + + Location loc = contractOp.getLoc(); + auto ctx = rewriter.getContext(); + + Value rhsVal = adaptor.getRhs(); + VectorType rhsVecTy = contractOp.getRhsType(); + Type rhsElemTy = rhsVecTy.getElementType(); + + bool doExtF = false, doExtSI = false, doExtUI = false; + if (auto extfRhsOp = rhsVal.getDefiningOp()) { + rhsVal = extfRhsOp.getIn(); + rhsVecTy = cast(rhsVal.getType()); + doExtF = true; + } else if (auto 
extsiRhsOp = rhsVal.getDefiningOp()) { + rhsVal = extsiRhsOp.getIn(); + rhsVecTy = cast(rhsVal.getType()); + doExtSI = true; + } else if (auto extuiRhsOp = rhsVal.getDefiningOp()) { + rhsVal = extuiRhsOp.getIn(); + rhsVecTy = cast(rhsVal.getType()); + doExtUI = true; + } + + int64_t nDim = rhsVecTy.getShape().size(); + SmallVector rhsPermutation; + for (int64_t i = 0; i < nDim - 2; i++) + rhsPermutation.push_back(i); + rhsPermutation.push_back(nDim - 1); + rhsPermutation.push_back(nDim - 2); + auto transpRhsVecTy = getTransposedVectorType(rhsVecTy); + rhsVal = rewriter + .create(loc, transpRhsVecTy, rhsVal, + rhsPermutation) + .getResult(); + + if (doExtF) + rhsVal = + rewriter + .create( + loc, VectorType::get(transpRhsVecTy.getShape(), rhsElemTy), + rhsVal) + .getOut(); + if (doExtSI) + rhsVal = + rewriter + .create( + loc, VectorType::get(transpRhsVecTy.getShape(), rhsElemTy), + rhsVal) + .getOut(); + if (doExtUI) + rhsVal = + rewriter + .create( + loc, VectorType::get(transpRhsVecTy.getShape(), rhsElemTy), + rhsVal) + .getOut(); + + SmallVector oldIdxMaps(contractOp.getIndexingMapsArray()); + + nDim = oldIdxMaps[1].getNumDims(); + SmallVector innerDimPerm; + for (int64_t i = 0; i < nDim - 2; i++) + innerDimPerm.push_back(i); + innerDimPerm.push_back(nDim - 1); + innerDimPerm.push_back(nDim - 2); + auto transpPermMap = AffineMap::getPermutationMap(innerDimPerm, ctx); + + auto newIdxMaps = rewriter.getAffineMapArrayAttr( + {oldIdxMaps[0], oldIdxMaps[1].compose(transpPermMap), oldIdxMaps[2]}); + + rewriter.replaceOpWithNewOp( + contractOp, contractOp.getResult().getType(), adaptor.getLhs(), rhsVal, + adaptor.getAcc(), newIdxMaps, contractOp.getIteratorTypes()); + + return success(); + } +}; + //============================================================================// //============ AIEML canonicalization conversion patterns ===============// //============================================================================// @@ -470,6 +620,10 @@ static void 
configureAIEMLCanonicalizeLegalizations(ConversionTarget &target, [](vector::TransferWriteOp op) { return cast(op.getVector().getType()).getRank() < 2; }); + target.addDynamicallyLegalOp( + [](vector::ContractionOp op) { + return !isGemmBTransposedContractionOp(op); + }); } static void @@ -477,8 +631,9 @@ populateAIEMLCanonicalizeConversionPatterns(RewritePatternSet &patterns, TargetBackend backend) { patterns.add(patterns.getContext(), 1024, 256); - patterns.add(patterns.getContext()); + patterns + .add(patterns.getContext()); } //============================================================================// diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir index 103cbbbcbe..236fe53140 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir +++ b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir @@ -6,7 +6,7 @@ //===----------------------------------------------------------------------===// module { -aie.device(npu) { +aie.device(npu1_3col) { //shim %tile00 = aie.tile(0, 0) diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 79ee207026..94f5888512 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -580,7 +580,7 @@ def core_body(): @core(cores[i][1], "conv2dk3.o") def core_body(): - scale = 11 + scale = 1 for _ in for_(sys.maxsize): # acquire weights and rtps once @@ -697,7 +697,7 @@ def core_body(): @core(cores[i][3], "conv2dk3.o") def core_body(): - scale = 11 + scale = 1 for _ in for_(sys.maxsize): # acquire weights and rtps once @@ -927,33 +927,24 @@ def core_body(): ) def sequence(inputFromL3, weightsFromL3, outputToL3): - # for c, col in enumerate(rtp_name): - # for r, row in enumerate(col): - # NpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1) # scale - - # NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, 
index=1, value=0) - # NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=1) - - # NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0) - - # NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0) - - # # # write RTP parameters - # npuWriteRTPOp( - # "rtpComputeTile02", col=0, row=2, index=0, value=1 - # ) # scale - # npuWriteRTPOp( - # "rtpComputeTile03", col=0, row=3, index=0, value=1 - # ) # scale - # npuWriteRTPOp( - # "rtpComputeTile05", col=0, row=5, index=0, value=1 - # ) # scale - # npuWriteRTPOp( - # "rtpComputeTile04", col=0, row=4, index=0, value=1 - # ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input - # npuWriteRTPOp( - # "rtpComputeTile04", col=0, row=4, index=1, value=0 - # ) # skip_scale + NpuWriteRTPOp("rtpComputeTile02", col=0, row=2, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile03", col=0, row=3, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile04", col=0, row=5, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=1, value=0) + NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=1) + + NpuWriteRTPOp("rtpComputeTile15", col=1, row=5, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile14", col=1, row=4, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile12", col=1, row=2, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0) + + NpuWriteRTPOp("rtpComputeTile22", col=2, row=2, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile23", col=2, row=3, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile25", col=2, row=5, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=1) + NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0) npu_dma_memcpy_nd( metadata="act1_00_02_01", diff --git 
a/programming_examples/ml/resnet/layers_conv2_x/run_makefile.lit b/programming_examples/ml/resnet/layers_conv2_x/run_makefile.lit index c6bf5b4886..6097345491 100644 --- a/programming_examples/ml/resnet/layers_conv2_x/run_makefile.lit +++ b/programming_examples/ml/resnet/layers_conv2_x/run_makefile.lit @@ -1,8 +1,7 @@ // (c) Copyright 2024 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// This test is disabled due to random failures on gihub CI -// REQUIRES: ryzen_ai, chess, torch, has_random_failures +// REQUIRES: ryzen_ai, chess, torch // // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile diff --git a/python/compiler/aiecc/cl_arguments.py b/python/compiler/aiecc/cl_arguments.py index cbfb5edcd3..4473cb54ca 100644 --- a/python/compiler/aiecc/cl_arguments.py +++ b/python/compiler/aiecc/cl_arguments.py @@ -239,6 +239,12 @@ def parse_args(args=None): const=True, help="Generate xclbin", ) + parser.add_argument( + "--xclbin-input", + dest="xclbin_input", + default=None, + help="Generate kernel into existing xclbin file", + ) parser.add_argument( "--link_against_hsa", dest="link_against_hsa", diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py index 62dd87190b..be72ee3363 100644 --- a/python/compiler/aiecc/main.py +++ b/python/compiler/aiecc/main.py @@ -21,6 +21,7 @@ import tempfile from textwrap import dedent import time +import uuid from aie.extras.runtime.passes import Pipeline @@ -207,7 +208,8 @@ def emit_partition(mlir_module_str, kernel_id="0x901", start_columns=None): else: start_columns = list(range(1, 6 - num_cols)) - uuid = random.randint(2222, 9999) + # Generate a uuid + pdi_uuid = uuid.uuid4() return { "aie_partition": { "name": "QoS", @@ -220,7 +222,7 @@ def emit_partition(mlir_module_str, kernel_id="0x901", start_columns=None): }, "PDIs": [ { - "uuid": "00000000-0000-0000-0000-00000000" + str(uuid), + "uuid": str(pdi_uuid), "file_name": "./design.pdi", "cdo_groups": [ { @@ 
-589,7 +591,25 @@ async def process_xclbin_gen(self): # fmt: off await self.do_call(task, ["bootgen", "-arch", "versal", "-image", self.prepend_tmp("design.bif"), "-o", self.prepend_tmp("design.pdi"), "-w"]) - await self.do_call(task, ["xclbinutil", "--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json"), "--add-kernel", self.prepend_tmp("kernels.json"), "--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"), "--force", "--output", opts.xclbin_name]) + if opts.xclbin_input: + await self.do_call(task, ["xclbinutil", + "--dump-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_input_partition.json"), + "--force", "--input", opts.xclbin_input]) + with open(self.prepend_tmp("aie_input_partition.json")) as f: + input_partition = json.load(f) + with open(self.prepend_tmp("aie_partition.json")) as f: + new_partition = json.load(f) + input_partition["aie_partition"]["PDIs"].append(new_partition["aie_partition"]["PDIs"][0]) + with open(self.prepend_tmp("aie_partition.json"), "w") as f: + json.dump(input_partition, f, indent=2) + flag = ['--input', opts.xclbin_input] + else: + flag = ["--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json")] + + await self.do_call(task, ["xclbinutil"] + flag + + ["--add-kernel", self.prepend_tmp("kernels.json"), + "--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"), + "--force", "--output", opts.xclbin_name]) # fmt: on async def process_host_cgen(self, aie_target, file_with_addresses): @@ -801,7 +821,6 @@ def make_sim_dir(x): "test_lib", "include", ) - sim_makefile = os.path.join(runtime_simlib_path, "Makefile") sim_genwrapper = os.path.join(runtime_simlib_path, "genwrapper_for_ps.cpp") file_physical = self.prepend_tmp("input_physical.mlir") memory_allocator = os.path.join( @@ -893,8 +912,6 @@ def make_sim_dir(x): ], ) ) - processes.append(self.do_call(task, ["cp", sim_makefile, sim_dir])) - 
processes.append(self.do_call(task, ["cp", sim_genwrapper, sim_ps_dir])) processes.append( self.do_call( task, @@ -905,7 +922,7 @@ def make_sim_dir(x): "-shared", "-o", os.path.join(sim_ps_dir, "ps.so"), - os.path.join(runtime_simlib_path, "genwrapper_for_ps.cpp"), + sim_genwrapper, *aie_target_defines(aie_target), *host_opts, *sim_cc_args, diff --git a/test/dialect/AIEVec/precanonicalization-aieml.mlir b/test/dialect/AIEVec/precanonicalization-aieml.mlir index e4e1004a0c..14b557b49b 100644 --- a/test/dialect/AIEVec/precanonicalization-aieml.mlir +++ b/test/dialect/AIEVec/precanonicalization-aieml.mlir @@ -85,3 +85,84 @@ func.func @multidim_vector_transfer(%in : memref<64x64x32x8xbf16>, return } +// +// ----- +// + +// CHECK: #[[IDXMAPA:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> +// CHECK: #[[IDXMAPB:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)> +// CHECK: #[[IDXMAPC:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d4, d5)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> + +// CHECK-LABEL: func.func @vector_contract_permuted_b( +// CHECK-SAME: %[[VA:[a-zA-Z0-9]+]]: vector<1x1x4x8xbf16>, +// CHECK-SAME: %[[VB:[a-zA-Z0-9]+]]: vector<1x1x4x8xbf16>, +// CHECK-SAME: %[[VC:[a-zA-Z0-9]+]]: vector<1x1x4x4xf32> +func.func @vector_contract_permuted_b(%A : vector<1x1x4x8xbf16>, + %B : vector<1x1x4x8xbf16>, + %C : vector<1x1x4x4xf32>) + -> vector<1x1x4x4xf32> { + // CHECK: %[[TRB:.*]] = vector.transpose %[[VB]], [0, 1, 3, 2] : + // CHECK-SAME: vector<1x1x4x8xbf16> to vector<1x1x8x4xbf16> + // CHECK: %[[RES:.*]] = vector.contract { + // CHECK-SAME: indexing_maps = [#[[IDXMAPA]], #[[IDXMAPB]], #[[IDXMAPC]]], + // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", + // CHECK-SAME: "parallel", "parallel", "reduction"], + // CHECK-SAME: kind = #vector.kind} + // CHECK-SAME: 
%[[VA]], %[[TRB]], %[[VC]] : + // CHECK-SAME: vector<1x1x4x8xbf16>, vector<1x1x8x4xbf16> + // CHECK-SAME: into vector<1x1x4x4xf32> + %res = vector.contract { + indexing_maps = [#map1, #map2, #map3], + iterator_types = ["parallel", "parallel", "reduction", + "parallel", "parallel", "reduction"], + kind = #vector.kind} %A, %B, %C : + vector<1x1x4x8xbf16>, vector<1x1x4x8xbf16> into vector<1x1x4x4xf32> + return %res : vector<1x1x4x4xf32> +} + +// +// ----- +// + +// CHECK: #[[IDXMAPA:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> +// CHECK: #[[IDXMAPB:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)> +// CHECK: #[[IDXMAPC:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d4, d5)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> + +// CHECK-LABEL: func.func @vector_contract_permuted_b( +// CHECK-SAME: %[[VA:[a-zA-Z0-9]+]]: vector<1x1x4x8xbf16>, +// CHECK-SAME: %[[VB:[a-zA-Z0-9]+]]: vector<1x1x4x8xbf16>, +// CHECK-SAME: %[[VC:[a-zA-Z0-9]+]]: vector<1x1x4x4xf32> +func.func @vector_contract_permuted_b(%A : vector<1x1x4x8xbf16>, + %B : vector<1x1x4x8xbf16>, + %C : vector<1x1x4x4xf32>) + -> vector<1x1x4x4xf32> { + // CHECK: %[[LHS:.*]] = arith.extf %[[VA]] : + // CHECK-SAME: vector<1x1x4x8xbf16> to vector<1x1x4x8xf32> + // CHECK: %[[TRB:.*]] = vector.transpose %[[VB]], [0, 1, 3, 2] : + // CHECK-SAME: vector<1x1x4x8xbf16> to vector<1x1x8x4xbf16> + // CHECK: %[[RHS:.*]] = arith.extf %[[TRB]] : + // CHECK-SAME: vector<1x1x8x4xbf16> to vector<1x1x8x4xf32> + // CHECK: %[[RES:.*]] = vector.contract { + // CHECK-SAME: indexing_maps = [#[[IDXMAPA]], #[[IDXMAPB]], #[[IDXMAPC]]], + // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", + // CHECK-SAME: "parallel", "parallel", "reduction"], + // CHECK-SAME: kind = #vector.kind} + // CHECK-SAME: %[[LHS]], %[[RHS]], %[[VC]] : + // 
CHECK-SAME: vector<1x1x4x8xf32>, vector<1x1x8x4xf32> + // CHECK-SAME: into vector<1x1x4x4xf32> + %lhs = arith.extf %A : vector<1x1x4x8xbf16> to vector<1x1x4x8xf32> + %rhs = arith.extf %B : vector<1x1x4x8xbf16> to vector<1x1x4x8xf32> + %res = vector.contract { + indexing_maps = [#map1, #map2, #map3], + iterator_types = ["parallel", "parallel", "reduction", + "parallel", "parallel", "reduction"], + kind = #vector.kind} %lhs, %rhs, %C : + vector<1x1x4x8xf32>, vector<1x1x4x8xf32> into vector<1x1x4x4xf32> + return %res : vector<1x1x4x4xf32> +} \ No newline at end of file diff --git a/test/npu-xrt/add_one_two/aie1.mlir b/test/npu-xrt/add_one_two/aie1.mlir new file mode 100644 index 0000000000..676dda4305 --- /dev/null +++ b/test/npu-xrt/add_one_two/aie1.mlir @@ -0,0 +1,53 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + %t00 = aie.tile(0, 0) + %t01 = aie.tile(0, 1) + %t02 = aie.tile(0, 2) + + aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] () + + aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] () + + aie.core(%t02) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1_32 = arith.constant 1 : i32 + + scf.for %steps = %c0 to %c8 step %c1 { + %subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview> + %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<8xi32> + %subview1 = 
aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview> + %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<8xi32> + scf.for %arg3 = %c0 to %c8 step %c1 { + %0 = memref.load %elem0[%arg3] : memref<8xi32> + %1 = arith.addi %0, %c1_32 : i32 + memref.store %1, %elem1[%arg3] : memref<8xi32> + } + aie.objectfifo.release @objFifo_in1(Consume, 1) + aie.objectfifo.release @objFifo_out1(Produce, 1) + } + aie.end + } + func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } + return + } + } +} diff --git a/test/npu-xrt/add_one_two/aie2.mlir b/test/npu-xrt/add_one_two/aie2.mlir new file mode 100644 index 0000000000..75f1f9beb3 --- /dev/null +++ b/test/npu-xrt/add_one_two/aie2.mlir @@ -0,0 +1,53 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + %t00 = aie.tile(0, 0) + %t01 = aie.tile(0, 1) + %t02 = aie.tile(0, 2) + + aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] () + + aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] () + + aie.core(%t02) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2_32 = arith.constant 2 : i32 + + scf.for %steps = %c0 to %c8 step %c1 { + %subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview> + %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<8xi32> + %subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview> + %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<8xi32> + scf.for %arg3 = %c0 to %c8 step %c1 { + %0 = memref.load %elem0[%arg3] : memref<8xi32> + %1 = arith.addi %0, %c2_32 : i32 + memref.store %1, %elem1[%arg3] : memref<8xi32> + } + aie.objectfifo.release @objFifo_in1(Consume, 1) + aie.objectfifo.release @objFifo_out1(Produce, 1) + } + aie.end + } + func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : 
memref<64xi32> + aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } + return + } + } +} diff --git a/test/npu-xrt/add_one_two/run.lit b/test/npu-xrt/add_one_two/run.lit new file mode 100644 index 0000000000..60fb9dbf43 --- /dev/null +++ b/test/npu-xrt/add_one_two/run.lit @@ -0,0 +1,11 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: %python aiecc.py --xclbin-kernel-name=ADDONE --xclbin-kernel-id=0x901 --xclbin-instance-name=ADDONEINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=add_one.xclbin --npu-insts-name=insts.txt %S/aie1.mlir +// RUN: %python aiecc.py --xclbin-kernel-name=ADDTWO --xclbin-kernel-id=0x902 --xclbin-instance-name=ADDTWOINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-input=add_one.xclbin --xclbin-name=add_two.xclbin --npu-insts-name=insts.txt %S/aie2.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_npu ./test.exe -x add_two.xclbin -i insts.txt | FileCheck %s +// CHECK: PASS! + diff --git a/test/npu-xrt/add_one_two/test.cpp b/test/npu-xrt/add_one_two/test.cpp new file mode 100644 index 0000000000..0f2a5c3c7e --- /dev/null +++ b/test/npu-xrt/add_one_two/test.cpp @@ -0,0 +1,221 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int IN_SIZE = 64; +constexpr int OUT_SIZE = 64; + +namespace po = boost::program_options; + +void check_arg_file_exists(po::variables_map &vm_in, std::string name) { + if (!vm_in.count(name)) { + throw std::runtime_error("Error: no " + name + " file was provided\n"); + } else { + std::ifstream test(vm_in[name].as()); + if (!test) { + throw std::runtime_error("The " + name + " file " + + vm_in[name].as() + + " does not exist.\n"); + } + } +} + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + + // Program arguments parsing + po::options_description desc("Allowed options"); + desc.add_options()("help,h", "produce help message")( + "xclbin,x", po::value()->required(), + "the input xclbin path")("verbosity,v", + po::value()->default_value(0), + "the verbosity of the output")( + "instr,i", po::value()->required(), + "path of file containing userspace instructions to be sent to the LX6"); + po::variables_map vm; + + try { + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << "\n"; + return 1; + } + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + std::cerr << "Usage:\n" << desc << "\n"; + return 1; + } + + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); + + std::vector instr_v = + load_instr_sequence(vm["instr"].as()); + + 
int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel0 = *std::find_if(xkernels.begin(), xkernels.end(), + [](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name == "ADDONE"; + }); + auto kernelName0 = xkernel0.get_name(); + auto xkernel1 = *std::find_if(xkernels.begin(), xkernels.end(), + [](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name == "ADDTWO"; + }); + auto kernelName1 = xkernel1.get_name(); + + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + + device.register_xclbin(xclbin); + + // get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernels: " << kernelName0 << " and " + << kernelName1 << "\n"; + + auto kernel0 = xrt::kernel(context, kernelName0); + + auto bo0_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel0.group_id(0)); + auto bo0_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(2)); + auto bo0_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(3)); + auto bo0_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(4)); + + auto kernel1 = xrt::kernel(context, kernelName1); + + auto bo1_instr = xrt::bo(device, 
instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel1.group_id(0)); + auto bo1_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(2)); + auto bo1_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(3)); + auto bo1_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + uint32_t *bufInA = bo0_inA.map(); + std::vector srcVecA; + for (int i = 0; i < IN_SIZE; i++) + srcVecA.push_back(i + 1); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); + + void *bufInstr = bo0_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo0_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo0_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel 0.\n"; + + auto run0 = kernel0(bo0_instr, instr_v.size(), bo0_inA, bo0_inB, bo0_out); + run0.wait(); + + bo0_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + uint32_t *bufOut = bo0_out.map(); + + // same instructions as kernel1 + bufInstr = bo1_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + bo1_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // copy kernel0 output to kernel1 input + bufInA = bo1_inA.map(); + memcpy(bufInA, bufOut, IN_SIZE * sizeof(uint32_t)); + bo1_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel 1.\n"; + auto run1 = kernel1(bo1_instr, instr_v.size(), bo1_inA, bo1_inB, bo1_out); + run1.wait(); + + bo1_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufOut = bo1_out.map(); + + int errors = 0; + + for (uint32_t i = 0; i < 64; i++) { + uint32_t ref = (i + 1) + 1 + 2; + if (*(bufOut + i) != ref) { + std::cout << "Error in output " << *(bufOut + i) << " != " << ref + << std::endl; + errors++; + } else { + std::cout << "Correct output " << *(bufOut + i) << " == " << ref + << std::endl; + } + } + 
+ if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nfailed.\n\n"; + return 1; + } +} diff --git a/tools/aie2xclbin/CMakeLists.txt b/tools/aie2xclbin/CMakeLists.txt index b6056b18c9..cd8afac255 100644 --- a/tools/aie2xclbin/CMakeLists.txt +++ b/tools/aie2xclbin/CMakeLists.txt @@ -21,6 +21,12 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) configure_file(configure.h.in configure.h) target_include_directories(aie2xclbin PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") +if(MSVC) + set(UUID "Rpcrt4.lib") +else() + find_library (UUID uuid REQUIRED) +endif() + target_link_libraries(aie2xclbin ${dialect_libs} MLIRParser @@ -37,7 +43,8 @@ target_link_libraries(aie2xclbin AIEX AIEXTransforms MLIRAIEVecDialect - MLIRXLLVMDialect) + MLIRXLLVMDialect + ${UUID}) install(TARGETS aie2xclbin EXPORT AIE2XCLBIN diff --git a/tools/aie2xclbin/XCLBinGen.cpp b/tools/aie2xclbin/XCLBinGen.cpp index a45ea8bd07..a75e06662e 100644 --- a/tools/aie2xclbin/XCLBinGen.cpp +++ b/tools/aie2xclbin/XCLBinGen.cpp @@ -45,7 +45,13 @@ #include #ifdef _WIN32 +#include "windows.h" +// For UUID stuff +#include "rpcdce.h" + #define setenv(name, var, ignore) _putenv_s(name, var) +#else +#include #endif using namespace llvm; @@ -126,6 +132,31 @@ void xilinx::findVitis(XCLBinGenConfig &TK) { } } +static std::string getUUIDString() { + std::string val; +#ifdef _WIN32 + UUID *uuid; + RPC_STATUS status; + status = UuidCreate(uuid); + if (status != RPC_S_OK) + errs() << "Failed to create UUID\n"; + RPC_CSTR *uuidstring; + status = UuidToStringA(uuid, uuidstring); + if (status != RPC_S_OK) + errs() << "Failed to convert UUID to string\n"; + val = std::string((char *)uuidstring); + status = RpcStringFreeA(uuidstring); + if (status != RPC_S_OK) + errs() << "Failed to free UUID string\n"; +#else + uuid_t binuuid; + uuid_generate_random(binuuid); + char uuid[37]; + uuid_unparse_lower(binuuid, uuid); + val = std::string(uuid); +#endif + return val; +} static void 
addAIELoweringPasses(OpPassManager &pm) { pm.addPass(createLowerAffinePass()); pm.addPass(AIE::createAIECanonicalizeDevicePass()); @@ -453,6 +484,7 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp, if (!aiePartitionJsonOut) return moduleOp.emitOpError(errorMessage); + std::string uuid_str = getUUIDString(); std::string aie_partition_json_data = R"( { "aie_partition": { @@ -468,7 +500,7 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp, }, "PDIs": [ { - "uuid": "00000000-0000-0000-0000-000000008025", + "uuid": ")" + uuid_str + R"(", "file_name": "./design.pdi", "cdo_groups": [ { diff --git a/utils/clone-llvm.sh b/utils/clone-llvm.sh index bfd3dae037..498b36a19a 100755 --- a/utils/clone-llvm.sh +++ b/utils/clone-llvm.sh @@ -13,8 +13,8 @@ ##===----------------------------------------------------------------------===## # The LLVM commit to use. -LLVM_PROJECT_COMMIT=8a71284cb9463a90fab0d9e8edbeb5d879531e32 -DATETIME=2024051512 +LLVM_PROJECT_COMMIT=25b65be43df56c1b7bea3fe2596fb36c2788d7af +DATETIME=2024052220 WHEEL_VERSION=19.0.0.$DATETIME+${LLVM_PROJECT_COMMIT:0:8} ############################################################################################