diff --git a/include/aie/Dialect/AIEVec/Analysis/CMakeLists.txt b/include/aie/Dialect/AIEVec/Analysis/CMakeLists.txt
new file mode 100644
index 0000000000..570641cef5
--- /dev/null
+++ b/include/aie/Dialect/AIEVec/Analysis/CMakeLists.txt
@@ -0,0 +1,12 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2022 Xilinx Inc.
+
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name AIEVecAnalysis)
+add_public_tablegen_target(MLIRAIEVecAnalysisPassIncGen)
+
+add_mlir_doc(Passes AIEVecAnalysisPasses ./ -gen-pass-doc)
diff --git a/include/aie/Dialect/AIEVec/Analysis/Passes.h b/include/aie/Dialect/AIEVec/Analysis/Passes.h
new file mode 100644
index 0000000000..de1dc35cef
--- /dev/null
+++ b/include/aie/Dialect/AIEVec/Analysis/Passes.h
@@ -0,0 +1,46 @@
+//===- Passes.h - AIE Vector Passes -----------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2022 Xilinx Inc.
+//
+//===----------------------------------------------------------------------===//
+// Register all the AIE vectorization passes
+//===----------------------------------------------------------------------===//
+
+#ifndef AIE_DIALECT_AIEVEC_ANALYSIS_PASSES_H
+#define AIE_DIALECT_AIEVEC_ANALYSIS_PASSES_H
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassOptions.h"
+#include <memory>
+
+//===----------------------------------------------------------------------===//
+// Registration
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+namespace func {
+class FuncOp;
+} // namespace func
+} // namespace mlir
+
+namespace xilinx {
+namespace aievec {
+
+#define GEN_PASS_DECL
+#define GEN_PASS_CLASSES
+#include "aie/Dialect/AIEVec/Analysis/Passes.h.inc"
+
+std::unique_ptr<mlir::Pass> createAIEVecConvolutionAnalysisPass();
+
+/// Generate the code for registering passes.
+#define GEN_PASS_REGISTRATION
+#include "aie/Dialect/AIEVec/Analysis/Passes.h.inc"
+
+} // end namespace aievec
+} // end namespace xilinx
+
+#endif // AIE_DIALECT_AIEVEC_ANALYSIS_PASSES_H
diff --git a/include/aie/Dialect/AIEVec/Analysis/Passes.td b/include/aie/Dialect/AIEVec/Analysis/Passes.td
new file mode 100644
index 0000000000..078504df4e
--- /dev/null
+++ b/include/aie/Dialect/AIEVec/Analysis/Passes.td
@@ -0,0 +1,28 @@
+//=== Passes.td - AIE vector analysis pass definition file -*- tablegen -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2023 AMD Inc.
+//
+//===----------------------------------------------------------------------===//
+// This file contains definitions for passes within the AIEVec/ directory.
+//===----------------------------------------------------------------------===//
+
+#ifndef AIE_DIALECT_AIEVEC_ANALYSIS_PASSES
+#define AIE_DIALECT_AIEVEC_ANALYSIS_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def AIEVecConvAnalysis : Pass<"aievec-convolution-analysis", "mlir::func::FuncOp"> {
+  let summary = "Find MAC chains that can be replaced by convolution ops in "
+                "AIE-ML";
+  let constructor = "xilinx::aievec::createAIEVecConvolutionAnalysisPass()";
+  let options = [
+    Option<"printResult", "print", "bool", /*default=*/"false",
+           "Print the result of the analysis">,
+  ];
+}
+
+#endif // AIE_DIALECT_AIEVEC_ANALYSIS_PASSES
diff --git a/include/aie/Dialect/AIEVec/CMakeLists.txt b/include/aie/Dialect/AIEVec/CMakeLists.txt
index 1c3baf43bc..0603587912 100644
--- a/include/aie/Dialect/AIEVec/CMakeLists.txt
+++ b/include/aie/Dialect/AIEVec/CMakeLists.txt
@@ -5,5 +5,6 @@
 #
 # (c) Copyright 2022 Xilinx Inc.
 
+add_subdirectory(Analysis)
 add_subdirectory(IR)
 add_subdirectory(Transforms)
diff --git a/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp b/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp
index 16b25df533..006ffaa9a2 100644
--- a/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp
+++ b/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp
@@ -1,4 +1,5 @@
 #include "aie/Dialect/AIEVec/AIEVecUtils.h"
+#include "aie/Dialect/AIEVec/Analysis/Passes.h"
 #include "aie/Dialect/AIEVec/IR/AIEVecOps.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -192,6 +193,10 @@ populateAIEVecV2TransformationPatterns(RewritePatternSet &patterns) {
   patterns.add(patterns.getContext());
 }
 
+//===----------------------------------------------------------------------===//
+// Legalizations
+//===----------------------------------------------------------------------===//
+
 static void
 configureAIEVecV1TransformationLegalizations(ConversionTarget &target) {
   target.addLegalDialect();
   target.addDynamicallyLegalOp(
   });
 }
 
-//===----------------------------------------------------------------------===//
-// Legalizations
-//===----------------------------------------------------------------------===//
 static void
 configureAIEVecV2TransformationLegalizations(ConversionTarget &target) {
   target.addDynamicallyLegalOp(
@@ -373,9 +375,11 @@ void xilinx::aievec::buildOptimizeAIEVec(OpPassManager &pm,
   pm.addPass(createCSEPass());
   pm.addPass(createCanonicalizerPass());
 
-  // TODO: This pass should only be included if the target is AIEML.
   // Add generating aievec convolution ops pass
-  pm.addPass(createAIEVecConvOpTransformationPass(options));
+  if (options.aieTarget == "aieml") {
+    pm.addPass(createAIEVecConvolutionAnalysisPass());
+    pm.addPass(createAIEVecConvOpTransformationPass(options));
+  }
 
   // Add post-lowering canonicalization passes.
   pm.addPass(createCSEPass());
diff --git a/lib/Dialect/AIEVec/Transforms/CMakeLists.txt b/lib/Dialect/AIEVec/Transforms/CMakeLists.txt
index 6a8fbc686b..88d414e96a 100644
--- a/lib/Dialect/AIEVec/Transforms/CMakeLists.txt
+++ b/lib/Dialect/AIEVec/Transforms/CMakeLists.txt
@@ -16,9 +16,11 @@ add_mlir_dialect_library(MLIRAIEVecTransforms
 
   ADDITIONAL_HEADER_DIRS
   ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/aie/Dialect/AIEVec/Transforms
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/aie/Dialect/AIEVec/Analysis
 
   DEPENDS
   MLIRAIEVecPassIncGen
+  MLIRAIEVecAnalysisPassIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
diff --git a/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp b/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp
index 34ef680bab..ba83c28822 100644
--- a/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp
+++ b/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp
@@ -45,6 +45,10 @@ using namespace xilinx::aievec;
 
 #define DEBUG_TYPE "vector-to-aievec-conversion"
 
+//===----------------------------------------------------------------------===//
+// Rewrite patterns
+//===----------------------------------------------------------------------===//
+
 template <typename OpTy>
 struct SetInboundsToReadStoreOpPattern : public RewritePattern {
   SetInboundsToReadStoreOpPattern(MLIRContext *context)
@@ -73,6 +77,10 @@ struct SetInboundsToReadStoreOpPattern : public RewritePattern {
 using SetInboundsToReadOp = SetInboundsToReadStoreOpPattern<vector::TransferReadOp>;
 using SetInboundsToWriteOp = SetInboundsToReadStoreOpPattern<vector::TransferWriteOp>;
 
+//===----------------------------------------------------------------------===//
+// Lowering passes
+//===----------------------------------------------------------------------===//
+
 struct RedundantLoadStoreOptimizationPass
     : public PassWrapper<RedundantLoadStoreOptimizationPass,
                          OperationPass<func::FuncOp>> {
@@ -111,7 +119,6 @@ void xilinx::aievec::buildConvertVectorToAIEVec(
   // NOTE: This sub-pipeline ingests arbitrary MLIR Vector code.
   buildCanonicalizeVectorForAIEVec(
       pm, options.getCanonicalizeVectorForAIEVecOptions());
-
   // NOTE: At this stage, all the Vector code in the IR can be mapped
   // NOTE: to AIEVec operations.
 
@@ -122,7 +129,6 @@ void xilinx::aievec::buildConvertVectorToAIEVec(
   // NOTE: This sub-pipeline ingests MLIR Vector code that can be mapped to
   // NOTE: AIEVec operations.
   buildLowerVectorToAIEVec(pm, options.getLowerVectorToAIEVecOptions());
-
   // NOTE: At this stage, all vector operations are expressed in AIEVec dialect.
 
 //============================================================================
diff --git a/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp b/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp
index 987fe6014c..77c1aaf30c 100644
--- a/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp
+++ b/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp
@@ -12,11 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "aie/Dialect/AIEVec/AIEVecUtils.h"
+#include "aie/Dialect/AIEVec/Analysis/Passes.h"
 #include "aie/Dialect/AIEVec/IR/AIEVecOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/AnalysisManager.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include <algorithm>
+#include <memory>
 
 #include "FoldMulAddChainToConvOp.h"
 
@@ -26,367 +29,327 @@ using namespace mlir;
 using namespace vector;
 using namespace xilinx;
 using namespace xilinx::aievec;
 
-typedef std::tuple<int8_t, aievec::UPDOp, arith::MulIOp> MulDefTupleTy;
-using MulDefTupleVecTy = SmallVector<MulDefTupleTy, 8>;
-using MulDefMapTy = DenseMap<Value, MulDefTupleVecTy>;
-
-// If only one of the operands of the given add op is an add op, return that
-// operand's defining op; otherwise return null.
-arith::AddIOp getDefAddOp(arith::AddIOp addOp) {
-  auto defLhs = dyn_cast<arith::AddIOp>(addOp->getOperand(0).getDefiningOp());
-  auto defRhs = dyn_cast<arith::AddIOp>(addOp->getOperand(1).getDefiningOp());
-  if ((!defLhs && !defRhs) || (defLhs && defRhs)) {
-    return nullptr;
-  }
-  return defLhs ? defLhs : defRhs;
-}
-
-// Return true if one of the operands of the given mul op is a broadcast of a
-// upd op and the other operand of the mul op is a upd op. In this case, the
-// arguments are recorded for bookkeeping. Otherwise, return false and leave
-// the bookkeeping unchanged.
-bool checkChainPattern(arith::MulIOp mulOp, MulDefMapTy &macChainMap,
-                       SmallVectorImpl<Value> &bcastOpSourceVec) {
-  aievec::BroadcastOp bcastOp = nullptr;
-  aievec::UPDOp updOp = nullptr;
-
-  if (isa<aievec::BroadcastOp>(mulOp.getOperand(0).getDefiningOp())) {
-    bcastOp = cast<aievec::BroadcastOp>(mulOp->getOperand(0).getDefiningOp());
-    if (!isa<aievec::UPDOp>(mulOp->getOperand(1).getDefiningOp())) {
-      return false;
-    }
-    updOp = cast<aievec::UPDOp>(mulOp->getOperand(1).getDefiningOp());
-  } else if (isa<aievec::BroadcastOp>(mulOp.getOperand(1).getDefiningOp())) {
-    bcastOp = cast<aievec::BroadcastOp>(mulOp->getOperand(1).getDefiningOp());
-    if (!isa<aievec::UPDOp>(mulOp->getOperand(0).getDefiningOp())) {
-      return false;
-    }
-    updOp = cast<aievec::UPDOp>(mulOp->getOperand(0).getDefiningOp());
-  } else {
-    return false;
-  }
+namespace xilinx::aievec {
+#define GEN_PASS_DEF_AIEVECCONVANALYSIS
+#include "aie/Dialect/AIEVec/Analysis/Passes.h.inc"
+} // namespace xilinx::aievec
 
-  if (!isa<aievec::UPDOp>(bcastOp.getSource().getDefiningOp())) {
-    return false;
-  }
-
-  if (!macChainMap.count(bcastOp.getSource())) {
-    bcastOpSourceVec.push_back(bcastOp.getSource());
-    MulDefTupleVecTy tupleVec;
-    tupleVec.push_back(std::make_tuple(bcastOp.getIdx(), updOp, mulOp));
-    macChainMap.insert(std::make_pair(bcastOp.getSource(), tupleVec));
-  } else {
-    macChainMap[bcastOp.getSource()].push_back(
-        std::make_tuple(bcastOp.getIdx(), updOp, mulOp));
-  }
-  return true;
-}
-
-// The defs of mul ops consist of an upd op and a broadcast op.
-// The chain map looks like below:
-// | BroadcastOp source | vector<MulDefTupleTy> |
-// The mul add op chain can be grouped by broadcast op's source.
-// For each group, broadcastOp idx can be sorted to find the start of the
-// memrefs used by broadcast op and upd op.
-void buildChainMap(arith::AddIOp curAddOp, bool &hasMulConv, Value &acc,
-                   MulDefMapTy &macChainMap,
-                   SmallVectorImpl<Value> &bcastOpSourceVec) {
-  while (true) {
-    auto defLhs =
-        dyn_cast<arith::MulIOp>(curAddOp->getOperand(0).getDefiningOp());
-    auto defRhs =
-        dyn_cast<arith::MulIOp>(curAddOp->getOperand(1).getDefiningOp());
-
-    if (!defLhs && !defRhs) {
-      break;
-    }
-    // If both ops of add op are mul ops, this will reach the top of the
-    // chain. Check the legality for both mul op and insert them to the chain
-    // map.
-    else if (defLhs && defRhs) {
-      if (!checkChainPattern(defLhs, macChainMap, bcastOpSourceVec) ||
-          !checkChainPattern(defRhs, macChainMap, bcastOpSourceVec)) {
-        break;
-      }
-      hasMulConv = true;
+/// This analysis builds the longest possible chain of MAC operations whose
+/// operands are a vector that may or may not be shifted, and a broadcast.
+/// That is, these MACs represent `vector x scalar` ops, and are candidates to
+/// be grouped and replaced by mul_conv/fma_conv ops in AIE-ML.
+//
+// We build this chain recursively, climbing up the use-def chain of
+// accumulators.
+struct LongestConvMACChainAnalysis {
+  static AnalysisManager *am;
+
+  struct ConvMac {
+    // If there's a non-accumulating convolution upchain,
+    // store it here temporarily.
+    std::unique_ptr<ConvMac> topOfChainMulConv;
+    // Accumulator value, if there is one.
+    Value acc;
+    // Left-hand side (non-broadcasting) source value
+    Value lhs;
+    // Right-hand side (broadcasting) source value
+    Value rhs;
+    // Amount that lhs is shifted
+    uint8_t shift;
+    // Element in rhs that is broadcasted
+    uint8_t bcastIdx;
+    ConvMac(Value lhs, Value rhs, uint8_t shift, uint8_t bcastIdx)
+        : topOfChainMulConv(nullptr), acc(nullptr), lhs(lhs), rhs(rhs),
+          shift(shift), bcastIdx(bcastIdx) {}
+  };
+
+  struct ConvMacChainGroup {
+    // Group start index within the chain
+    uint64_t fromIdx;
+    // Index in chain after group last MAC
+    uint64_t toIdx;
+    // Initial position of the signal to be convolved
+    int64_t signalShift;
+    // Initial position of the convolution filter
+    int64_t bcastShift;
+    // Distance between elements in the filter
+    int64_t bcastDist; // Must be 1 or 2
+  };
+
+  typedef SmallVector<std::unique_ptr<ConvMac>, 8> ConvMacChain;
+  typedef SmallVector<ConvMacChainGroup, 8> ConvMacChainGroupList;
+
+  std::unique_ptr<ConvMacChain> convMacChain;
+  ConvMacChainGroupList groupsInChain;
+
+  /// Sort the chain of MACs by sources. When two MACs share the same sources,
+  /// sort them by the broadcast index. If they don't, sort them by the order
+  /// of the ops in the code. This function should be called after the chain
+  /// is completed, and before operating on the groups of MACs. After sorting,
+  /// MACs that can be fused into single convolution ops will be contiguous in
+  /// the chain.
+  void sortChain() {
+    if ((*convMacChain)[0]->acc) {
+      std::sort(convMacChain->begin(), convMacChain->end(),
+                [](const auto &a, const auto &b) {
+                  if (a->lhs == b->lhs) {
+                    if (a->rhs == b->rhs)
+                      return a->bcastIdx < b->bcastIdx;
+                    return a->rhs.getDefiningOp()->isBeforeInBlock(
+                        b->rhs.getDefiningOp());
+                  }
+                  // We should probably sort by lhs load address, if it exists
+                  // XXX: We assume all MACs are in the same block. If they're
+                  // XXX: not, this will assert.
+                  return a->lhs.getDefiningOp()->isBeforeInBlock(
+                      b->lhs.getDefiningOp());
+                });
-    } else {
-      arith::MulIOp curMulOp = defLhs ? defLhs : defRhs;
-      if (!checkChainPattern(curMulOp, macChainMap, bcastOpSourceVec)) {
-        break;
-      }
-      acc = defLhs ? curAddOp->getOperand(1) : curAddOp->getOperand(0);
-    }
-
-    // Get the def add op of the curOp operands
-    arith::AddIOp defAddOp = getDefAddOp(curAddOp);
-
-    // The user/consumer operation must be an add op, belonging to
-    // the same basic block as curOp.
-    if (!defAddOp || !defAddOp->hasOneUse() ||
-        curAddOp->getBlock() != defAddOp->getBlock()) {
-      break;
+    } else {
+      // If the top of the chain is not an accumulation, bring up all related
+      // convolution MACs and sort the rest by lhs.
+      auto firstLhs = (*convMacChain)[0]->lhs;
+      std::sort(convMacChain->begin(), convMacChain->end(),
+                [&firstLhs](const auto &a, const auto &b) {
+                  if (a->lhs == b->lhs) {
+                    if (a->rhs == b->rhs)
+                      return a->bcastIdx < b->bcastIdx;
+                    return a->rhs.getDefiningOp()->isBeforeInBlock(
+                        b->rhs.getDefiningOp());
+                  }
+                  if (a->lhs == firstLhs)
+                    return true;
+                  if (b->lhs == firstLhs)
+                    return false;
+                  return a->lhs.getDefiningOp()->isBeforeInBlock(
+                      b->lhs.getDefiningOp());
+                });
+      // Float the empty accumulator to the top.
+      if ((*convMacChain)[0]->acc)
+        for (auto &convMac : *convMacChain)
+          if (!convMac->acc) {
+            std::swap((*convMacChain)[0]->acc, convMac->acc);
+            break;
+          }
+    }
+  }
-    curAddOp = defAddOp;
-  }
-}
-
-void refreshFusedGroups(
-    MulDefTupleTy defTuple, arith::MulIOp nextMulOp,
-    SmallVector<arith::MulIOp, 8> &fusedOps,
-    SmallVectorImpl<SmallVector<arith::MulIOp, 8>> &groupFusedOps,
-    int8_t &curIdx, aievec::UPDOp &curUpdOp, arith::MulIOp &curMulOp) {
-  groupFusedOps.push_back(fusedOps);
-  fusedOps.clear();
-  fusedOps.push_back(nextMulOp);
-  std::tie(curIdx, curUpdOp, curMulOp) = defTuple;
-}
-
-// Check whether mul add chain is valid for the transformation and classify the
-// fused ops into different groups with valid constant memref distances.
-bool collectFusedOps(
-    unsigned maxGroupSize, unsigned &dupFactor,
-    SmallVectorImpl<Value> &bcastOpSourceVec,
-    SmallVectorImpl<SmallVector<arith::MulIOp, 8>> &groupFusedOps,
-    MulDefMapTy &macChainMap) {
-  int xDist = -1, zDist = -1;
-  for (auto item : bcastOpSourceVec) {
-    auto macChain = macChainMap[item];
-    std::sort(macChain.begin(), macChain.end());
-    int8_t curIdx = 0;
-    aievec::UPDOp curUpdOp = nullptr;
-    arith::MulIOp curMulOp = nullptr;
-    std::tie(curIdx, curUpdOp, curMulOp) = *macChain.begin();
-    SmallVector<int32_t, 8> dists;
-    SmallVector<arith::MulIOp, 8> fusedOps;
-    fusedOps.push_back(curMulOp);
-
-    for (auto it = std::next(macChain.begin()); it != macChain.end(); ++it) {
-      int8_t nextIdx = 0;
-      aievec::UPDOp nextUpdOp = nullptr;
-      arith::MulIOp nextMulOp = nullptr;
-      MulDefTupleTy defTuple = *it;
-      std::tie(nextIdx, nextUpdOp, nextMulOp) = defTuple;
-
-      int32_t dist = nextIdx - curIdx;
-
-      // Target AIE-ML intrinsic mac_conv_32x8 for v32int8 type and
-      // mac_conv_16x4 for v16int16 type. Thus, the distance of broadcast op
-      // source between two mul add ops cannot be larger than 32/8 or 16/4,
-      // which is 4. If dist is larger than 1, we need to shuffle the load to
-      // get the elements with the interval of dist.
-      if (dist > 4) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      dists.push_back(dist);
-      if (curUpdOp.getSource() != nextUpdOp.getSource()) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      MemRefType curMemRefType =
-          cast<MemRefType>(curUpdOp.getSource().getType());
-      MemRefType nextMemRefType =
-          cast<MemRefType>(nextUpdOp.getSource().getType());
-
-      ArrayRef<int64_t> curSizes = curMemRefType.getShape();
-      ArrayRef<int64_t> nextSizes = nextMemRefType.getShape();
-      if (curSizes.size() != nextSizes.size()) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      AffineExpr curLinearAccess =
-          constructLinearizedAffineExprForUPDOp(curUpdOp);
-      AffineExpr nextLinearAccess =
-          constructLinearizedAffineExprForUPDOp(nextUpdOp);
-      if (!curLinearAccess || !nextLinearAccess) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      AffineExpr curBase, nextBase;
-      int32_t curOffset, nextOffset;
-
-      // Get the base and offset from linear access expr
-      std::tie(curBase, curOffset) = extractBaseAndOffset(curLinearAccess);
-      std::tie(nextBase, nextOffset) = extractBaseAndOffset(nextLinearAccess);
-      if (curBase != nextBase) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      dist = nextOffset - curOffset;
-      if (dist != 1) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-      dists.push_back(dist);
-      if ((xDist != -1 && xDist != dists[0]) ||
-          (zDist != -1 && zDist != dists[1])) {
-        if (fusedOps.size() < 2) {
-          return false;
+  // Return the list of convolution MAC ops in the chain as pairs of indices
+  // indicating the position within the chain where a group starts and the
+  // position where it ends: [start, end). If they have not been precomputed
+  // yet, this method will generate them.
+  const ConvMacChainGroupList &getGroupsInChain() {
+    // If there is no chain, or the groups have already been computed, return
+    // the stored list.
+    if (groupsInChain.size() > 0 || !convMacChain || convMacChain->size() == 0)
+      return groupsInChain;
+
+    uint64_t grpStartIdx = 0;
+    uint64_t grpCurIdx = 0;
+    Value curLhs = (*convMacChain)[0]->lhs;
+    Value curRhs = (*convMacChain)[0]->rhs;
+    for (const auto &convMac : *convMacChain) {
+      if (grpCurIdx > grpStartIdx) {
+        if (curLhs != convMac->lhs || curRhs != convMac->rhs) {
+          groupsInChain.push_back({grpStartIdx, grpCurIdx,
+                                   getGroupSignalShift(grpStartIdx, grpCurIdx),
+                                   getGroupBcastShift(grpStartIdx, grpCurIdx),
+                                   getGroupBcastDist(grpStartIdx, grpCurIdx)});
+          grpStartIdx = grpCurIdx;
+          curLhs = convMac->lhs;
+          curRhs = convMac->rhs;
         }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      xDist = dists[0];
-      zDist = dists[1];
-      dupFactor = dists[0];
-
-      fusedOps.push_back(nextMulOp);
-      std::tie(curIdx, curUpdOp, curMulOp) = defTuple;
-
-      if (fusedOps.size() > maxGroupSize) {
-        fusedOps.pop_back();
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
      }
+      grpCurIdx++;
    }
-    groupFusedOps.push_back(fusedOps);
+    if (grpStartIdx < grpCurIdx)
+      groupsInChain.push_back({grpStartIdx, grpCurIdx,
+                               getGroupSignalShift(grpStartIdx, grpCurIdx),
+                               getGroupBcastShift(grpStartIdx, grpCurIdx),
+                               getGroupBcastDist(grpStartIdx, grpCurIdx)});
+    return groupsInChain;
  }
-  return true;
-}
-
-struct canFoldMulAddChainToConvOpAnalysis {
-  canFoldMulAddChainToConvOpAnalysis(arith::AddIOp addOp) {
-    if (!isa<VectorType>(addOp.getType())) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
-    VectorType resultType = cast<VectorType>(addOp.getResult().getType());
+  // Return the signal shift for the group of MACs in the chain within
+  // [fromIdx, toIdx). This method verifies that the elements of the signal
+  // are contiguously accessed. If they are not, or the specified group
+  // doesn't exist, this function returns -1.
+  int64_t getGroupSignalShift(uint64_t fromIdx, uint64_t toIdx) {
+    if (fromIdx >= toIdx || toIdx > convMacChain->size())
+      return -1;
+    if (toIdx == fromIdx + 1)
+      return static_cast<int64_t>((*convMacChain)[fromIdx]->shift);
+    for (uint64_t i = fromIdx; i < toIdx - 1; i++)
+      if ((static_cast<int64_t>((*convMacChain)[i + 1]->shift) -
+           static_cast<int64_t>((*convMacChain)[i]->shift)) != 1)
+        return -1;
+    return static_cast<int64_t>((*convMacChain)[fromIdx]->shift);
+  }
 
-    if (!resultType.getElementType().isa<IntegerType>()) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
+  // Return the shift of the first broadcasted element in the group. If there
+  // is no chain, or the group does not exist, returns -1.
+  int64_t getGroupBcastShift(uint64_t fromIdx, uint64_t toIdx) {
+    if (fromIdx >= toIdx || toIdx > convMacChain->size())
+      return -1;
+    return static_cast<int64_t>((*convMacChain)[fromIdx]->bcastIdx);
+  }
 
-    IntegerType resultElType = cast<IntegerType>(resultType.getElementType());
-    unsigned resultElWidth = resultElType.getWidth();
-    unsigned laneSize = getVectorLaneSize(resultType);
+  // Returns the broadcast distance between elements within the group. If the
+  // distance is not constant and equal to 1 or 2, it returns -1.
+  int64_t getGroupBcastDist(uint64_t fromIdx, uint64_t toIdx) {
+    if (fromIdx >= toIdx || toIdx > convMacChain->size())
+      return -1;
+    if (toIdx == fromIdx + 1)
+      return 1;
+    int64_t bcastDist =
+        static_cast<int64_t>((*convMacChain)[fromIdx + 1]->bcastIdx) -
+        static_cast<int64_t>((*convMacChain)[fromIdx]->bcastIdx);
+    if (bcastDist != 1 && bcastDist != 2)
+      return -1;
+    for (uint64_t i = fromIdx + 1; i < toIdx - 1; i++)
+      if ((static_cast<int64_t>((*convMacChain)[i + 1]->bcastIdx) -
+           static_cast<int64_t>((*convMacChain)[i]->bcastIdx)) != bcastDist)
+        return -1;
+    return bcastDist;
+  }
 
-    if ((laneSize != 32 || resultElWidth != 8) &&
-        (laneSize != 16 || resultElWidth != 16)) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
+  bool canChainBeReplacedWithConvOps() {
+    const auto &groups = getGroupsInChain();
+    if (groups.size() == 0)
+      return false;
+    for (const auto &group : groups)
+      if (group.signalShift == -1 || group.bcastShift == -1 ||
+          group.bcastDist == -1)
+        return false;
+    return true;
+  }
 
-    if (!addOp->hasOneUse()) {
-      canFoldMulAddChainToConvOp = false;
-      return;
+  std::unique_ptr<ConvMac> getConvMacFromMulOp(arith::MulIOp mulOp) {
+    auto mulOpLhsDefOp = mulOp.getLhs().getDefiningOp();
+    auto mulOpRhsDefOp = mulOp.getRhs().getDefiningOp();
+    if (!mulOpLhsDefOp || !mulOpRhsDefOp)
+      return nullptr;
+
+    // Obtain the broadcast operation feeding into the MulIOp
+    auto bcastOp = dyn_cast<aievec::BroadcastOp>(mulOpRhsDefOp);
+    if (!bcastOp) {
+      bcastOp = dyn_cast<aievec::BroadcastOp>(mulOpLhsDefOp);
+      std::swap(mulOpLhsDefOp, mulOpRhsDefOp);
    }
+    if (!bcastOp)
+      return nullptr;
+
+    // Obtain the ext or ext->shift op feeding into the MulIOp
+    aievec::ExtOp extOp = nullptr;
+    aievec::ShiftOp shiftOp = nullptr;
+    shiftOp = dyn_cast<aievec::ShiftOp>(mulOpLhsDefOp);
+    if (shiftOp)
+      extOp = shiftOp.getLhs().getDefiningOp<aievec::ExtOp>();
+    else
+      extOp = dyn_cast<aievec::ExtOp>(mulOpLhsDefOp);
+
+    // XXX: Actually, ExtOp might not exist but should work anyway.
+    // XXX: Should it, though?
+    if (!extOp)
+      return nullptr;
+
+    Value lhs = extOp.getSource();
+    Value rhs = bcastOp.getSource();
+    uint8_t shift = 0;
+    if (shiftOp) {
+      auto shiftConstDefOp =
+          shiftOp.getShift().getDefiningOp<arith::ConstantOp>();
+      if (shiftConstDefOp) {
+        auto shiftAttr = cast<IntegerAttr>(shiftConstDefOp.getValue());
+        auto vType = cast<VectorType>(mulOp.getResult().getType());
+        shift = 8 * shiftAttr.getInt() / getElementSizeInBits(vType);
+      }
+    }
+    uint8_t bcastIdx = bcastOp.getIdx();
+    return std::make_unique<ConvMac>(lhs, rhs, shift, bcastIdx);
+  }
 
-    // Search for the last add op in the block.
-    auto usrOp = *addOp->getUsers().begin();
-    if (!usrOp || isa<arith::AddIOp>(usrOp)) {
-      canFoldMulAddChainToConvOp = false;
-      return;
+  std::unique_ptr<ConvMac> getConvMacFromAddOp(arith::AddIOp addOp) {
+    // Make sure at least one of them is a multiplication, and the other one
+    // is the accumulator coming from up the chain.
+    auto mulOp = addOp.getLhs().getDefiningOp<arith::MulIOp>();
+    Value acc = addOp.getRhs();
+    if (!mulOp) {
+      mulOp = addOp.getRhs().getDefiningOp<arith::MulIOp>();
+      acc = addOp.getLhs();
    }
 
-    arith::AddIOp curAddOp = addOp;
-    // bcastOpSourceVec is a container to trace the order of broadcast ops'
-    // source in the chain.
-    SmallVector<Value, 8> bcastOpSourceVec;
-
-    // Identify the chain and build a mul add chain map by recording the def of
-    // mul ops.
-    buildChainMap(curAddOp, hasMulConv, acc, macChainMap, bcastOpSourceVec);
-
-    if (macChainMap.empty() ||
-        std::any_of(macChainMap.begin(), macChainMap.end(),
-                    [](const auto &p) { return p.second.size() < 2; })) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
-
-    // Since we trace the order forwards, now reverse the vector.
-    std::reverse(bcastOpSourceVec.begin(), bcastOpSourceVec.end());
-
-    auto getConstantIdx = [](Value v) {
-      aievec::UPDOp bcastUPDOp = cast<aievec::UPDOp>(v.getDefiningOp());
-      SmallVector<Value, 8> indices(bcastUPDOp.getIndices().begin(),
-                                    bcastUPDOp.getIndices().end());
-      Value innerMostIdx = indices[indices.size() - 1];
-      int64_t val = -1;
-      if (auto idxDefOp = innerMostIdx.getDefiningOp()) {
-        if (auto constOp = dyn_cast<arith::ConstantOp>(idxDefOp)) {
-          val = cast<IntegerAttr>(constOp.getValue()).getInt();
+    if (!mulOp)
+      return nullptr;
+
+    // Get the parameters of the convolution from the operands of the MulIOp
+    auto convMac = getConvMacFromMulOp(mulOp);
+    if (!convMac)
+      return nullptr;
+
+    // If both sides are MulIOp, we might be at the top of the chain
+    auto upChainAccMulOp = acc.getDefiningOp<arith::MulIOp>();
+    if (upChainAccMulOp) {
+      auto convMac2 = getConvMacFromMulOp(upChainAccMulOp);
+      // XXX: We pre-sort the top two MACs to make sure that an undefined
+      // XXX: accumulator ends up on top of the chain.
+      // XXX: But it might not be necessary? CHECK!
+      if (convMac2 && convMac->lhs == convMac2->lhs &&
+          convMac->rhs == convMac2->rhs) {
+        if (convMac->bcastIdx < convMac2->bcastIdx &&
+            convMac->shift < convMac2->shift) {
+          convMac2->topOfChainMulConv = std::move(convMac);
+          convMac2->acc = acc;
+          return convMac2;
+        } else if (convMac->bcastIdx > convMac2->bcastIdx &&
+                   convMac->shift > convMac2->shift) {
+          convMac->topOfChainMulConv = std::move(convMac2);
+          convMac->acc = acc;
+          return convMac;
+        } else {
+          // WARNING: In this situation, the chain is ambiguous and picking one
+          // WARNING: option over the other may result in a successful
+          // WARNING: and/or better replacement. Here, we are assuming that it
+          // WARNING: is going to be either one or the other, or it won't
+          // WARNING: matter.
        }
+      } else {
+        convMac->topOfChainMulConv = std::move(convMac2);
      }
-      return val;
-    };
-
-    // If broadcast ops' sources are from the same memref, sort the broadcast
-    // ops by an increasing order of memrefs' constant indices.
-    std::sort(bcastOpSourceVec.begin(), bcastOpSourceVec.end(),
-              [&](const Value &a, const Value &b) {
-                aievec::UPDOp bcastUPDOpA =
-                    cast<aievec::UPDOp>(a.getDefiningOp());
-                aievec::UPDOp bcastUPDOpB =
-                    cast<aievec::UPDOp>(b.getDefiningOp());
-                if (bcastUPDOpA.getSource() == bcastUPDOpB.getSource()) {
-                  return getConstantIdx(a) <= getConstantIdx(b);
-                }
-                return true;
-              });
-
-    unsigned maxGroupSize = resultElWidth == 16 ? 4 : 8;
-
-    // Legality check for the mul add chain, and collect the ops that can be
-    // transformed to mul_conv and mul_conv.
-    if (!collectFusedOps(maxGroupSize, dupFactor, bcastOpSourceVec,
-                         groupFusedOps, macChainMap)) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
+    convMac->acc = acc;
+    return convMac;
+  }
 
-    if (std::any_of(groupFusedOps.begin(), groupFusedOps.end(),
-                    [](const auto &ops) { return ops.size() < 2; })) {
-      canFoldMulAddChainToConvOp = false;
+  LongestConvMACChainAnalysis(arith::AddIOp addOp) {
+    std::unique_ptr<ConvMac> macConvChainElem = getConvMacFromAddOp(addOp);
+    if (!macConvChainElem)
      return;
+
+    if (macConvChainElem->acc) {
+      auto upChainAddOp = macConvChainElem->acc.getDefiningOp<arith::AddIOp>();
+      if (upChainAddOp) {
+        auto &upChainChainAnalysis =
+            am->getChildAnalysis<LongestConvMACChainAnalysis>(upChainAddOp);
+        if (upChainChainAnalysis.convMacChain) {
+          convMacChain = std::move(upChainChainAnalysis.convMacChain);
+          convMacChain->push_back(std::move(macConvChainElem));
+          return;
+        }
+      }
    }
-    canFoldMulAddChainToConvOp = true;
+    assert(!convMacChain && "Convolution MAC chain unexpectedly not empty");
+    convMacChain = std::make_unique<ConvMacChain>();
+    if (macConvChainElem->topOfChainMulConv)
+      convMacChain->push_back(std::move(macConvChainElem->topOfChainMulConv));
+    convMacChain->push_back(std::move(macConvChainElem));
  }
-
-  MulDefMapTy macChainMap;
-  SmallVector<SmallVector<arith::MulIOp, 8>, 8> groupFusedOps;
-  unsigned dupFactor;
-  bool hasMulConv;
-  Value acc;
-  bool canFoldMulAddChainToConvOp;
};
-
-// This conversion pattern folds a mul add chain into mul_conv and mac_conv
-// ops. We can handle the mul add chain with a random order.
+// HACK: For some reason, it's not possible to access the analysis manager from
+// HACK: within an analysis, but we need it to build the analysis recursively.
+// HACK: If there is a good reason not to do this, we should find an
+// HACK: alternative way to build the MAC chain.
+AnalysisManager *LongestConvMACChainAnalysis::am = nullptr;
+
+// This conversion pattern folds a MAC chain into mul_conv and mac_conv
+// ops. We can handle the MAC chain in an arbitrary order.
 struct FoldMulAddChainToConvOpPattern
     : public OpConversionPattern<arith::AddIOp> {
   using OpConversionPattern::OpConversionPattern;
@@ -399,152 +362,100 @@ struct FoldMulAddChainToConvOpPattern
   LogicalResult
   matchAndRewrite(arith::AddIOp srcOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    canFoldMulAddChainToConvOpAnalysis analysis =
-        am.getChildAnalysis<canFoldMulAddChainToConvOpAnalysis>(srcOp);
-    if (!analysis.canFoldMulAddChainToConvOp)
+    auto &convMacChainAnalysis =
+        am.getChildAnalysis<LongestConvMACChainAnalysis>(srcOp);
+    auto &convMacChain = convMacChainAnalysis.convMacChain;
+    if (!convMacChain)
      return failure();
 
-    SmallVector<SmallVector<arith::MulIOp, 8>, 8> groupFusedOps =
-        analysis.groupFusedOps;
-    MulDefMapTy macChainMap = analysis.macChainMap;
-    unsigned dupFactor = analysis.dupFactor;
-    bool hasMulConv = analysis.hasMulConv;
-    Value acc = analysis.acc;
-
-    for (auto fusedOps : groupFusedOps) {
-      arith::MulIOp mulOp = (*fusedOps.begin());
-
-      // Get the mul op's lhs and rhs defining ops. We keep splat op at rhs.
-      if (isa<aievec::BroadcastOp>(mulOp->getOperand(0).getDefiningOp())) {
-        Value left = mulOp->getOperand(0);
-        Value right = mulOp->getOperand(1);
-        mulOp->setOperand(0, right);
-        mulOp->setOperand(1, left);
-      }
-
-      Value lhs = mulOp->getOperand(0);
-      Value rhs = mulOp->getOperand(1);
-
-      VectorType vType = cast<VectorType>(mulOp.getResult().getType());
-      Type sType = vType.getElementType();
-      IntegerType iType = cast<IntegerType>(sType);
-      unsigned width = iType.getWidth() <= 8 ? 32 : 64;
-      int32_t M = iType.getWidth() == 8 ? 32 : 16;
-      int32_t N = iType.getWidth() == 8 ? 8 : 4;
-
-      Type ctype = mlir::IntegerType::get(iType.getContext(), width);
-      Type opType = VectorType::get(vType.getShape(), ctype);
-
-      aievec::BroadcastOp bcastOp =
-          cast<aievec::BroadcastOp>(rhs.getDefiningOp());
-      aievec::UPDOp bcastUPDOp =
-          cast<aievec::UPDOp>(bcastOp.getSource().getDefiningOp());
-      SmallVector<Value, 8> indices(bcastUPDOp.getIndices().begin(),
-                                    bcastUPDOp.getIndices().end());
-      unsigned lanes = 512 / getElementSizeInBits(vType);
-      VectorType resType = createVectorType(lanes, sType);
-      Value innerMostIdx = indices[indices.size() - 1];
-      Value newIdx = innerMostIdx;
-      int64_t val = -1;
-      int64_t defIdx = -1;
-      // Transfer
-      // %c32 = arith.constant 32 : index
-      // %1 = aievec.upd %arg1[%c32] {index = 0 : i8} : vector<32xi8>
-      // %2 = aievec.broadcast %1 {idx = 0 : i8} : vector<32xi8>
-      // to
-
-      // %c0 = arith.constant 0 : index
-      // %1 = aievec.upd %arg1[%c0] {index = 0 : i8} : vector<64xi8>
-      // %2 = aievec.broadcast %1 {idx = 32 : i8} : vector<32xi8>
-      if (auto idxDefOp = innerMostIdx.getDefiningOp()) {
-        if (auto constOp = dyn_cast<arith::ConstantOp>(idxDefOp)) {
-          val = cast<IntegerAttr>(constOp.getValue()).getInt();
-          if (val) {
-            defIdx = val / lanes * lanes;
-            val %= lanes;
-            newIdx = rewriter.create<arith::ConstantOp>(
-                constOp.getLoc(),
-                rewriter.getIntegerAttr(constOp.getType(), defIdx));
-            indices[indices.size() - 1] = newIdx;
-          }
-        }
+    auto loc = srcOp.getLoc();
+    VectorType vecTy = cast<VectorType>(srcOp.getResult().getType());
+    unsigned elemWidth = cast<IntegerType>(vecTy.getElementType()).getWidth();
+    unsigned accWidth = elemWidth <= 8 ? 32 : 64;
+    int32_t M = elemWidth == 8 ? 32 : 16;
+    int32_t N = elemWidth == 8 ? 8 : 4;
+
+    Type wideElemTy = IntegerType::get(getContext(), accWidth);
+    Type accVecTy = VectorType::get(vecTy.getShape(), wideElemTy);
+
+    const auto &groups = convMacChainAnalysis.getGroupsInChain();
+    Value grpAcc = (*convMacChain)[groups[0].fromIdx]->acc;
+    if (grpAcc)
+      grpAcc = rewriter
+                   .create<aievec::UPSOp>(srcOp.getLoc(), accVecTy, grpAcc,
+                                          /*shift=*/0)
+                   .getResult();
+    for (const auto &group : groups) {
+      Value grpLhs = (*convMacChain)[group.fromIdx]->lhs;
+      Value grpRhs = (*convMacChain)[group.fromIdx]->rhs;
+      auto filterVecTy = cast<VectorType>(grpRhs.getType());
+      auto signalVecTy = cast<VectorType>(grpLhs.getType());
+      // Sort out the vector used as filter
+      // If the length of the filter is half that of the signal, concatenate
+      // the filter with itself.
+      if (2 * filterVecTy.getShape()[0] == signalVecTy.getShape()[0])
+        grpRhs =
+            rewriter
+                .create<aievec::ConcatOp>(
+                    loc, signalVecTy, SmallVector<Value>({grpRhs, grpRhs}))
+                .getResult();
+      // If the filter has duplicate elements, pack them.
+      if (group.bcastDist == 2)
+        grpRhs =
+            rewriter
+                .create<aievec::ShuffleOp>(loc, signalVecTy, grpRhs, /*mode=*/0)
+                .getResult();
+      // If the first element of the filter to be used is not 0, shift the
+      // filter to align the first element to the beginning.
+      if (group.bcastShift) {
+        int32_t shiftBytes =
+            group.bcastShift * getElementSizeInBits(filterVecTy) >>
+            (3 + group.bcastDist - 1);
+        auto shiftBytesCst =
+            rewriter
+                .create<arith::ConstantOp>(
+                    loc, rewriter.getI32IntegerAttr(shiftBytes))
+                .getResult();
+        grpRhs = rewriter
+                     .create<aievec::ShiftOp>(grpRhs.getDefiningOp()->getLoc(),
                                              signalVecTy, grpRhs, grpRhs,
                                              shiftBytesCst)
+                     .getResult();
      }
-
-      aievec::UPDOp newBcastOp = bcastUPDOp;
-
-      // Rewrite the upd op with maximum vector lanes
-      if (vType != resType) {
-        newBcastOp = rewriter.create<aievec::UPDOp>(
-            bcastUPDOp->getLoc(), resType, bcastUPDOp.getSource(), indices, 0,
-            0, TypedValue<VectorType>(nullptr));
-      }
-
-      // Since we do not need to use duplicated data like in AIE1, if a
-      // dup-factor exists, we extract the identical data by shuffle op. We use
-      // mode 0 to extract the elements with even indices for i8 type data.
-      Operation *shuffleOp = newBcastOp;
-      if (dupFactor != 1) {
-        shuffleOp = rewriter.create<aievec::ShuffleOp>(
-            newBcastOp.getLoc(), resType, newBcastOp.getResult(), 0);
+      // Sort out the vector used as signal
+      // If the signal to be convolved doesn't start at element 0, shift the
+      // signal to align the first element to the beginning.
+      if (group.signalShift) {
+        int32_t shiftBytes =
+            group.signalShift * getElementSizeInBits(signalVecTy) >> 3;
+        auto shiftBytesCst =
+            rewriter
+                .create<arith::ConstantOp>(
+                    loc, rewriter.getI32IntegerAttr(shiftBytes))
+                .getResult();
+        grpLhs = rewriter
+                     .create<aievec::ShiftOp>(loc, signalVecTy, grpLhs, grpLhs,
                                              shiftBytesCst)
+                     .getResult();
      }
-
-      int32_t shiftBytes = (bcastOp.getIdx() + val) *
-                           getElementSizeInBits(vType) / 8 / dupFactor;
-
-      rhs = shuffleOp->getResult(0);
-
-      // Generate a shift_bytes operation for rhs if the start position is not
-      // 0.
-      if (shiftBytes) {
-        arith::ConstantOp constOp = rewriter.create<arith::ConstantOp>(
-            shuffleOp->getLoc(), rewriter.getI32IntegerAttr(shiftBytes));
-        rhs = rewriter.create<aievec::ShiftOp>(
-            shuffleOp->getLoc(),
-            cast<VectorType>(shuffleOp->getResult(0).getType()),
-            shuffleOp->getResult(0), shuffleOp->getResult(0),
-            constOp.getResult());
-      }
-
-      aievec::UPDOp lUPDOp = cast<aievec::UPDOp>(lhs.getDefiningOp());
-      SmallVector<Value, 8> lIndices;
-      lIndices.append(lUPDOp.getIndices().begin(), lUPDOp.getIndices().end());
-
-      lhs = rewriter.create<aievec::UPDOp>(lUPDOp->getLoc(), resType,
-                                           lUPDOp.getSource(), lIndices, 0, 0,
-                                           TypedValue<VectorType>(nullptr));
-
-      if (!hasMulConv && acc.getType() != opType) {
-        auto upsOp = rewriter.create<aievec::UPSOp>(
-            acc.getDefiningOp()->getLoc(), opType, acc, shiftParam);
-        acc = upsOp->getResult(0);
-      }
-
-      Operation *convOp = nullptr;
-      if (fusedOps == groupFusedOps.back()) {
-        if (hasMulConv) {
-          convOp = rewriter.create<aievec::MulConvOp>(srcOp->getLoc(), opType,
-                                                      lhs, rhs, M, N);
-          hasMulConv = false;
-        } else {
-          convOp = rewriter.create<aievec::FMAConvOp>(
-              srcOp->getLoc(), opType, lhs, rhs, acc, M, N, false);
-        }
-        rewriter.replaceOpWithNewOp<aievec::SRSOp>(
-            srcOp, vType, convOp->getResult(0), shiftParam);
-        return success();
-      } else {
-        if (hasMulConv) {
-          convOp = rewriter.create<aievec::MulConvOp>(srcOp->getLoc(), opType,
                                                      lhs, rhs, M, N);
-          hasMulConv = false;
-        } else {
-          convOp = rewriter.create<aievec::FMAConvOp>(
-              srcOp->getLoc(), opType, lhs, rhs, acc, M, N, false);
-        }
-      }
-      acc = convOp->getResult(0);
+      // Generate a convolution operation for the group
+      // If there is no upchain accumulator, use a mul_conv; use a mac_conv
+      // otherwise.
+      if (!grpAcc)
+        grpAcc = rewriter
+                     .create<aievec::MulConvOp>(srcOp.getLoc(), accVecTy,
                                                grpLhs, grpRhs, M, N)
+                     .getResult();
+      else
+        grpAcc =
+            rewriter
+                .create<aievec::FMAConvOp>(srcOp.getLoc(), accVecTy, grpLhs,
                                           grpRhs, grpAcc, M, N, false)
+                .getResult();
+    }
-
-    llvm_unreachable("the conversion should end with srs op.");
+    rewriter.replaceOpWithNewOp<aievec::SRSOp>(srcOp, vecTy, grpAcc,
+                                               shiftParam);
+    return success();
  }
 
   AnalysisManager &am;
@@ -553,11 +464,12 @@
 void configureAIEVecConvOpTransformationLegalizations(ConversionTarget &target,
                                                       AnalysisManager &am) {
-  target.addLegalDialect();
+  LongestConvMACChainAnalysis::am = &am;
+  target.addLegalDialect();
   target.addLegalDialect();
   target.addDynamicallyLegalOp<arith::AddIOp>([&am](arith::AddIOp op) {
-    return !am.getChildAnalysis<canFoldMulAddChainToConvOpAnalysis>(op)
-                .canFoldMulAddChainToConvOp;
+    auto &convAnalysis = am.getChildAnalysis<LongestConvMACChainAnalysis>(op);
+    return !convAnalysis.canChainBeReplacedWithConvOps();
  });
}
 
@@ -567,3 +479,80 @@ void populateAIEVecConvOpTransformationPatterns(RewritePatternSet &patterns,
   patterns.add<FoldMulAddChainToConvOpPattern>(patterns.getContext(), am,
                                                shiftParam);
}
+
+struct AIEVecConvAnalysis : public AIEVecConvAnalysisBase<AIEVecConvAnalysis> {
+  AIEVecConvAnalysis() = default;
+  using ConvMacChain = LongestConvMACChainAnalysis::ConvMacChain;
+  using ConvMacChainGroupList =
+      LongestConvMACChainAnalysis::ConvMacChainGroupList;
+
+  void runOnOperation() override {
+    markAllAnalysesPreserved();
+    AnalysisManager am = getAnalysisManager();
+    LongestConvMACChainAnalysis::am = &am;
+    func::FuncOp func = getOperation();
+
+    // Compute all the chains
+    func.walk([&](arith::AddIOp addOp) {
+      if (isa<VectorType>(addOp.getResult().getType()))
+        am.getChildAnalysis<LongestConvMACChainAnalysis>(addOp);
+    });
+
+    // Sort the chains, ready to split by group
+    func.walk([&](arith::AddIOp addOp) {
+      if (isa<VectorType>(addOp.getResult().getType())) {
+        auto &analysis =
+            am.getChildAnalysis<LongestConvMACChainAnalysis>(addOp);
+        if (analysis.convMacChain)
+          analysis.sortChain();
+      }
+    });
+
+    if (printResult) {
+      func.walk([&](arith::AddIOp addOp) {
+        if (isa<VectorType>(addOp.getResult().getType())) {
+          auto &macChainAnalysis =
+              am.getChildAnalysis<LongestConvMACChainAnalysis>(addOp);
+          if (macChainAnalysis.canChainBeReplacedWithConvOps()) {
+            addOp.print(llvm::outs());
+            llvm::outs() << " is at the end of a convolution MAC Chain:\n";
+            listChain(macChainAnalysis.convMacChain,
+                      macChainAnalysis.getGroupsInChain());
+          }
+        }
+      });
+    }
+  }
+
+  void listChain(const std::unique_ptr<ConvMacChain> &chain,
+                 const ConvMacChainGroupList &groups) const {
+    uint64_t gIdx = 0;
+    for (const auto &group : groups) {
+      llvm::outs() << "-------------- GROUP " << std::to_string(gIdx)
+                   << " --------------\n";
+      llvm::outs() << " Signal Shift: " << std::to_string(group.signalShift)
+                   << "  Kernel Shift: " << std::to_string(group.bcastShift)
+                   << "  Kernel Duplication: "
+                   << std::to_string(group.bcastDist) << "\n";
+      for (uint64_t i = group.fromIdx; i < group.toIdx; i++) {
+        auto shift = (*chain)[i]->shift;
+        auto bcastIdx = (*chain)[i]->bcastIdx;
+        auto lhsOp = (*chain)[i]->lhs.getDefiningOp();
+        auto rhsOp = (*chain)[i]->rhs.getDefiningOp();
+        if (!(*chain)[i]->acc)
+          llvm::outs() << " [mul_conv]\n";
+        llvm::outs() << "   [Shift: " << std::to_string(shift) << "]: ";
+        lhsOp->print(llvm::outs());
+        llvm::outs() << "\n   [Bcast: " << std::to_string(bcastIdx) << "]: ";
+        rhsOp->print(llvm::outs());
+        llvm::outs() << "\n";
+      }
+      gIdx++;
+    }
+    llvm::outs() << "-------------------------------------\n";
+  }
+};
+
+std::unique_ptr<mlir::Pass> xilinx::aievec::createAIEVecConvolutionAnalysisPass() {
+  return std::make_unique<AIEVecConvAnalysis>();
+}
diff --git a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
index 0f79414a9e..f3a6c11d88 100644
--- a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
+++ b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <bitset>
 #include <cassert>
 #include <tuple>
@@ -16,6 +17,7 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 #include "VectorToAIEVecConversions.h"
 
@@ -31,6 +33,50 @@ using namespace xilinx::aievec;
 // Utility functions
 //===----------------------------------------------------------------------===//
 
+// Return the offset of a given transfer read operation with regards to the
+// specified vector type. If the read is aligned to the specified alignment
+// parameter (in bits), then the offset is 0. Otherwise, the offset is the
+// number of elements past the immediately preceding aligned vector length.
+template <
+    typename TransferReadLikeOp,
+    typename = std::enable_if_t<
+        std::is_same_v<TransferReadLikeOp, vector::TransferReadOp> ||
+        std::is_same_v<TransferReadLikeOp, vector::TransferReadOp::Adaptor>>>
+static int64_t getTransferReadAlignmentOffset(TransferReadLikeOp readOp,
+                                              VectorType vType,
+                                              int64_t alignment) {
+  // TODO: Add support for cases where the index is not coming from an
+  // TODO: `affine.apply` op or when the affine map has more than one
+  // TODO: dimension. We also need to address the case where the index is an
+  // TODO: induction variable.
+  auto innerMostIndex = readOp.getIndices().back();
+  auto vectorLength = vType.getShape().back();
+  auto idxDefOp = innerMostIndex.getDefiningOp();
+  if (!idxDefOp)
+    return 0L;
+  int64_t vectorLengthAlignmentOffset =
+      TypeSwitch<Operation *, int64_t>(idxDefOp)
+          .Case<arith::ConstantOp>([&](auto constantOp) {
+            return cast<IntegerAttr>(constantOp.getValue()).getInt() %
+                   vectorLength;
+          })
+          .template Case<AffineApplyOp>([&](auto applyOp) {
+            if (applyOp.getAffineMap().getNumDims() == 1)
+              return applyOp.getAffineMap().compose(ArrayRef<int64_t>{0})[0] %
                     vectorLength;
+            return 0L;
+          })
+          .Default([&](auto) {
+            // XXX: If we can't determine the offset, we assume the access is
+            // XXX: aligned.
+            return 0L;
+          });
+  int64_t absoluteAlignmentOffset = alignment / getElementSizeInBits(vType);
+  if (vectorLengthAlignmentOffset % absoluteAlignmentOffset)
+    return vectorLengthAlignmentOffset;
+  return 0;
+}
 
 // Given the LHS and RHS of an `arith::AddIOp`, if one of them is defined by an
 // `arith::MulIOp`, return a tuple with the `lhs`, `rhs`, and `acc` of the MAC
 // operation that can replace them.
@@ -87,6 +133,73 @@ static aievec::MulElemOp createMulElemAieML(ConversionPatternRewriter &rewriter,
   return mulElemOp;
}
 
+// Return the list of attributes that configure an `aievec.select` op to
+// perform a rotation of the input vector by `rotation` number of elements.
+// The attribute values depend on the vector type of the select operation.
+static SmallVector<NamedAttribute>
+buildAttributeListForRotationSelectOp(PatternRewriter &rewriter, VectorType vTy,
+                                      int64_t rotation) {
+  unsigned width = 0;
+  auto elemTy = vTy.getElementType();
+  auto intTy = dyn_cast<IntegerType>(elemTy);
+  if (intTy)
+    width = intTy.getWidth();
+  StringAttr attr0 = rewriter.getStringAttr("0");
+  StringAttr attr0x06040200 = rewriter.getStringAttr("0x06040200");
+  StringAttr attr0x0e0c0a08 = rewriter.getStringAttr("0x0e0c0a08");
+  StringAttr attr0x2103 = rewriter.getStringAttr("0x2103");
+  StringAttr attr0x3210 = rewriter.getStringAttr("0x3210");
+  StringAttr selectAttrName = rewriter.getStringAttr("select");
+  StringAttr xoffsetsAttrName = rewriter.getStringAttr("xoffsets");
+  StringAttr xoffsetsHiAttrName = rewriter.getStringAttr("xoffsets_hi");
+  StringAttr xsquareAttrName = rewriter.getStringAttr("xsquare");
+  StringAttr xstartAttrName = rewriter.getStringAttr("xstart");
+  StringAttr yoffsetsAttrName = rewriter.getStringAttr("yoffsets");
+  StringAttr yoffsetsHiAttrName = rewriter.getStringAttr("yoffsets_hi");
+  StringAttr ysquareAttrName = rewriter.getStringAttr("ysquare");
+  StringAttr ystartAttrName = rewriter.getStringAttr("ystart");
+
+  switch (width) {
+  case 16:
+    if (rotation % 2) {
+      int64_t xstart = rotation + 1;
+      int64_t ystart = rotation - 1;
+      return SmallVector<NamedAttribute>(
+          {{selectAttrName, rewriter.getStringAttr("0x11111111")},
+           {xoffsetsAttrName, attr0x06040200},
+           {xoffsetsHiAttrName, attr0x0e0c0a08},
+           {xsquareAttrName, attr0x2103},
+           {xstartAttrName, rewriter.getStringAttr(std::to_string(xstart))},
+           {yoffsetsAttrName, rewriter.getStringAttr("0x0503010f")},
+           {yoffsetsHiAttrName, rewriter.getStringAttr("0x0d0b0907")},
+           {ysquareAttrName, attr0x2103},
+           {ystartAttrName, rewriter.getStringAttr(std::to_string(ystart))}});
+    } else {
+      return SmallVector<NamedAttribute>(
+          {{selectAttrName, attr0},
+           {xoffsetsAttrName, attr0x06040200},
+           {xoffsetsHiAttrName, attr0x0e0c0a08},
+           {xsquareAttrName, attr0x3210},
+           {xstartAttrName, rewriter.getStringAttr(std::to_string(rotation))},
+           {yoffsetsAttrName, attr0},
+           {yoffsetsHiAttrName, attr0},
+           {ysquareAttrName, attr0},
+           {ystartAttrName, attr0}});
+    }
+    break;
+  case 32:
+    return SmallVector<NamedAttribute>(
+        {{selectAttrName, attr0},
+         {xoffsetsAttrName, rewriter.getStringAttr("0x76543210")},
+         {xsquareAttrName, attr0x3210},
+         {xstartAttrName, rewriter.getStringAttr(std::to_string(rotation))},
+         {yoffsetsAttrName, attr0},
+         {ysquareAttrName, attr0},
+         {ystartAttrName, attr0}});
  }
+  return {};
+}
+
 namespace xilinx {
 namespace aievec {
 
@@ -301,8 +414,9 @@ struct UPDOpEffectiveAccessSizeAnalysis {
};
 
 //===----------------------------------------------------------------------===//
-// Lowering patterns
+// Rewrite patterns
 //===----------------------------------------------------------------------===//
+
 // This pattern folds `vector.extract` and `vector.broadcast` into
 // `aievec.broadcast` for aie-ml
 struct FoldVectorExtractAndBroadcastToAIEBroadcast
@@ -314,17 +428,28 @@ struct FoldVectorExtractAndBroadcastToAIEBroadcast
                   ConversionPatternRewriter &rewriter) const override {
 
     auto extOp =
-        dyn_cast<vector::ExtractOp>(bcastOp.getSource().getDefiningOp());
+        dyn_cast<vector::ExtractOp>(adaptor.getSource().getDefiningOp());
     if (!extOp)
       return failure();
 
     auto src = extOp.getVector();
     auto pos = extOp.getPosition();
-    VectorType resultType = bcastOp.getResult().getType().cast<VectorType>();
-
-    rewriter.replaceOpWithNewOp<aievec::BroadcastOp>(
-        bcastOp, resultType, src, cast<IntegerAttr>(pos[0]).getInt());
+    int64_t posVal = cast<IntegerAttr>(pos[0]).getInt();
+    VectorType srcVecType = cast<VectorType>(src.getType());
+    VectorType resultType = cast<VectorType>(bcastOp.getResult().getType());
+    if (srcVecType != resultType) {
+      if (srcVecType.getNumElements() != 2 * resultType.getNumElements())
+        return failure();
+      int8_t half = static_cast<int8_t>(posVal / resultType.getNumElements());
+      posVal -= half * resultType.getNumElements();
+      src = rewriter
+                .create<aievec::ExtOp>(extOp.getLoc(), resultType, src,
                                       rewriter.getI8IntegerAttr(half))
+                .getResult();
+    }
+    rewriter.replaceOpWithNewOp<aievec::BroadcastOp>(bcastOp, resultType, src,
                                                     posVal);
 
     return success();
  }
@@ -659,14 +784,15 @@ struct LowerVectorTransferReadToAIEUPD
   using OpConversionPattern::OpConversionPattern;
   LowerVectorTransferReadToAIEUPD(MLIRContext *context, AnalysisManager &am,
-                                  int32_t maxVectorSize = 256)
+                                  int64_t minVectorSize, int64_t maxVectorSize,
+                                  int64_t alignment, int64_t maxLoadSize)
       : OpConversionPattern(context), am(am),
-        maxVectorSize(maxVectorSize) {}
+        minVectorSize(minVectorSize), maxVectorSize(maxVectorSize),
+        vectorAlignment(alignment), maxLoadSize(maxLoadSize) {}
 
   LogicalResult
   matchAndRewrite(vector::TransferReadOp readOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    // == Handle invalid read operations ==
     // Masked loads
     if (readOp.getMask())
       return readOp.emitError() << "AIE doesn't support masked loads.";
@@ -680,75 +806,43 @@ struct LowerVectorTransferReadToAIEUPD
     if (map.isConstant())
       return failure();
 
-    // When a transfer read with a constant innermost index is not aligned, we
-    // get the corresponding aligned load followed by an aievec.shift op.
-    // Example:
-    // Convert -
-    // %0 = vector.transfer_read %arg1[16] : vector<32xi8>
-    // %1 = vector.transfer_read %arg1[34] : vector<32xi8>
-    //
-    // to -
-    //
-    // %0 = aievec.upd %arg1[0] : vector<32xi8>
-    // %1 = aievec.upd %arg1[32] : vector<32xi8>
-    // %2 = aievec.shift %0, %1 {shift = 16 : i32} : vector<32xi8>
-    // %3 = aievec.upd %arg1[64] : vector<32xi8>
-    // %4 = aievec.shift %2, %3 {shift = 2 : i32} : vector<32xi8>
-    //
-    SmallVector<Value, 8> indices(adaptor.getIndices().begin(),
                                  adaptor.getIndices().end());
-    Value innerMostIdx = indices[indices.size() - 1];
-    Value newIdx = innerMostIdx;
-    VectorType vType = readOp.getVector().getType().cast<VectorType>();
-    int32_t lanes = getVectorLaneSize(vType);
-
-    if (auto defOp = innerMostIdx.getDefiningOp()) {
-      if (auto constOp = dyn_cast<arith::ConstantOp>(defOp)) {
-        int64_t val = constOp.getValue().cast<IntegerAttr>().getInt();
-        if (val) {
-          int64_t offset = val % lanes;
-          int64_t idx = val / lanes * lanes;
-          newIdx = rewriter.create<arith::ConstantOp>(
-              constOp.getLoc(),
-              rewriter.getIntegerAttr(constOp.getType(), idx));
-          indices[indices.size() - 1] = newIdx;
-          int32_t shiftBytes = offset * getElementSizeInBits(vType) / 8;
-
-          if (shiftBytes) {
-            auto updOp = rewriter.create<aievec::UPDOp>(
-                readOp.getLoc(), vType, adaptor.getSource(), indices, 0, 0,
-                TypedValue<VectorType>(nullptr));
-            newIdx = rewriter.create<arith::ConstantOp>(
-                constOp.getLoc(),
-                rewriter.getIntegerAttr(constOp.getType(), idx + lanes));
-            indices[indices.size() - 1] = newIdx;
-            // Load the next vector lanes
-            auto nextUpdOp = rewriter.create<aievec::UPDOp>(
-                readOp.getLoc(), vType, adaptor.getSource(), indices, 0, 0,
-                TypedValue<VectorType>(nullptr));
-
-            arith::ConstantOp constOp = rewriter.create<arith::ConstantOp>(
-                readOp.getLoc(), rewriter.getI32IntegerAttr(shiftBytes));
-            rewriter.replaceOpWithNewOp<aievec::ShiftOp>(
-                readOp, vType, updOp->getResult(0), nextUpdOp->getResult(0),
-                constOp.getResult());
-          } else {
-            rewriter.replaceOpWithNewOp<aievec::UPDOp>(
-                readOp, vType, adaptor.getSource(), indices, 0, 0,
-                TypedValue<VectorType>(nullptr));
-          }
-          return success();
-        }
-      }
-    }
-    rewriter.replaceOpWithNewOp<aievec::UPDOp>(
-        readOp, vType, adaptor.getSource(), indices, 0, 0,
+    // Misaligned accesses
+    auto vType = readOp.getVectorType();
+    if (getTransferReadAlignmentOffset(adaptor, vType, vectorAlignment) != 0)
+      return failure();
+
+    // Invalid vector size.
+    // We can handle cases where the vector size is:
+    //   1) the minimum vector size
+    //   2) a square multiple of the alignment size and up to the maximum
+    //      vector size.
+    int64_t vSize = vType.getNumElements() * vType.getElementTypeBitWidth();
+    if (vSize > maxVectorSize ||
+        (vSize % vectorAlignment && vSize != minVectorSize))
+      return failure();
+    // We can deal with linked update instructions when the vector size is
+    // exactly twice the load size. This could change in future architectures.
+    if (vSize > maxLoadSize && vSize != maxLoadSize * 2)
+      return failure();
+    int64_t multiplicity = vSize / vectorAlignment;
+    if ((vSize > minVectorSize) && std::bitset<8>(multiplicity).count() != 1)
+      return failure();
+
+    auto updOp = rewriter.create<aievec::UPDOp>(
+        readOp.getLoc(), vType, adaptor.getSource(), adaptor.getIndices(), 0, 0,
         TypedValue<VectorType>(nullptr));
+    if (vSize > maxLoadSize) {
+      updOp = rewriter.create<aievec::UPDOp>(
+          readOp.getLoc(), vType, adaptor.getSource(), adaptor.getIndices(),
+          maxLoadSize, 1, updOp.getResult());
+    }
+    rewriter.replaceOp(readOp, updOp.getResult());
+
     return success();
  }
 
   AnalysisManager &am;
-  int32_t maxVectorSize;
+  int64_t minVectorSize, maxVectorSize, vectorAlignment, maxLoadSize;
};
 
 // XXX: Notice that this template doesn't verify that the vector element type
@@ -1440,36 +1534,151 @@ struct LowerVectorReductionAddBfloat16Op
  }
};
 
-// If a UPD op is loading a vector twice the size of the architecture
-// vector size, split it into a high and low load into the accumulator.
-// TODO: This is a process we may want to include as part of the
-// TODO: legalization of `vector.transfer_read`.
-struct SplitUPDOpOnAccPattern : public OpConversionPattern<aievec::UPDOp> {
+// Convert a `vector.extract_strided_slice` op on 1D vectors into an
+// `aievec.select` + `aievec.ext` op.
+struct LowerVectorExtractStridedSliceOpAIEv1Pattern
+    : public OpConversionPattern<vector::ExtractStridedSliceOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ExtractStridedSliceOp extractOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto vType = extractOp.getVectorType();
+    if (vType.getRank() != 1)
+      return failure();
+
+    int64_t stride = cast<IntegerAttr>(adaptor.getStrides()[0]).getInt();
+    if (stride != 1)
+      return failure();
+
+    // AIE doesn't support select operations on i8
+    if (getElementSizeInBits(vType) == 8)
+      return extractOp.emitError()
+             << "AIEv1 doesn't support select ops on int8 types";
+
+    // We only accept the case where we are extracting a slice half the size of
+    // the input vector.
+    int64_t size = cast<IntegerAttr>(adaptor.getSizes()[0]).getInt();
+    if (vType.getNumElements() != 2 * size)
+      return failure();
+
+    int64_t offset = cast<IntegerAttr>(adaptor.getOffsets()[0]).getInt();
+    auto selectOp = rewriter.create<aievec::SelectOp>(
+        extractOp.getLoc(), vType, adaptor.getVector(),
+        buildAttributeListForRotationSelectOp(rewriter, vType, offset));
+    rewriter.replaceOpWithNewOp<aievec::ExtOp>(extractOp, extractOp.getType(),
+                                               selectOp.getResult(),
+                                               rewriter.getI8IntegerAttr(0));
+
+    return success();
+  }
+};
+
+// Convert a `vector.extract_strided_slice` op on 1D vectors into an
+// `aievec.shift` op.
+struct LowerVectorExtractStridedSliceOpAIEMLPattern
+    : public OpConversionPattern<vector::ExtractStridedSliceOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ExtractStridedSliceOp extractOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto vType = cast<VectorType>(adaptor.getVector().getType());
+    if (vType.getRank() != 1)
+      return failure();
+
+    int64_t stride = cast<IntegerAttr>(adaptor.getStrides()[0]).getInt();
+    if (stride != 1)
+      return failure();
+
+    // We only accept the case where we are extracting a slice half the size of
+    // the input vector.
+    int64_t size = cast<IntegerAttr>(adaptor.getSizes()[0]).getInt();
+    if (vType.getNumElements() != 2 * size)
+      return failure();
+
+    auto shortVecType = cast<VectorType>(extractOp.getResult().getType());
+    auto bottomHalf = rewriter
+                          .create<aievec::ExtOp>(
+                              extractOp.getLoc(), shortVecType,
+                              adaptor.getVector(), rewriter.getI8IntegerAttr(0))
+                          .getResult();
+    auto topHalf = rewriter
+                       .create<aievec::ExtOp>(extractOp.getLoc(), shortVecType,
                                              adaptor.getVector(),
                                              rewriter.getI8IntegerAttr(1))
+                       .getResult();
+    int64_t offset = cast<IntegerAttr>(adaptor.getOffsets()[0]).getInt();
+    int32_t shiftBytes = offset * getElementSizeInBits(vType) / 8;
+    auto shiftBytesConstOp = rewriter.create<arith::ConstantOp>(
+        extractOp.getLoc(), rewriter.getIntegerType(32),
+        rewriter.getI32IntegerAttr(shiftBytes));
+    rewriter.replaceOpWithNewOp<aievec::ShiftOp>(
+        extractOp, shortVecType, bottomHalf, topHalf, shiftBytesConstOp);
+
+    return success();
+  }
+};
+
+// Replaces a short UPD op with a wide one followed by an ext op of the bottom
+// half.
+struct ExpandUPDToUPDAndExtPattern : public OpConversionPattern<aievec::UPDOp> {
   using OpConversionPattern::OpConversionPattern;
 
-  SplitUPDOpOnAccPattern(MLIRContext *context, AnalysisManager &am,
-                         int32_t maxVectorSize = 256)
-      : OpConversionPattern(context), am(am),
-        maxVectorSize(maxVectorSize) {}
+  ExpandUPDToUPDAndExtPattern(MLIRContext *context)
+      : OpConversionPattern(context) {}
 
   LogicalResult
   matchAndRewrite(aievec::UPDOp updOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    if (am.getChildAnalysis<UPDOpEffectiveAccessSizeAnalysis>(updOp)
-            .effectiveSize < 2 * static_cast<unsigned>(maxVectorSize))
+    // Verify that we haven't already expanded this one
+    if (updOp->hasOneUse() && isa<aievec::ExtOp>(*updOp->getUsers().begin()))
       return failure();
 
-    auto updOp0 = rewriter.create<aievec::UPDOp>(
-        updOp.getLoc(), updOp.getResult().getType(), adaptor.getSource(),
-        adaptor.getIndices(), 0, 0);
-    rewriter.replaceOpWithNewOp<aievec::UPDOp>(
-        updOp, updOp.getResult().getType(), adaptor.getSource(),
-        adaptor.getIndices(), 2 * maxVectorSize, 1, updOp0.getResult());
+    auto vecType = cast<VectorType>(updOp.getType());
+    SmallVector<int64_t, 4> vecShape(vecType.getShape().begin(),
                                     vecType.getShape().end());
+    vecShape[vecType.getRank() - 1] *= 2;
+    auto longVecType = VectorType::get(vecShape, vecType.getElementType());
+    auto newUpdOp = rewriter.create<aievec::UPDOp>(
+        updOp.getLoc(), longVecType, adaptor.getSource(), adaptor.getIndices(),
+        adaptor.getOffset(), adaptor.getIndex(), adaptor.getVector());
+    rewriter.replaceOpWithNewOp<aievec::ExtOp>(
+        updOp, vecType, newUpdOp.getResult(), rewriter.getI8IntegerAttr(0));
+
     return success();
  }
-
-  AnalysisManager &am;
-  int32_t maxVectorSize;
};
 
+// Replaces a wide UPD op followed by an ext op of the bottom half with a short
+// UPD op.
+struct LowerVectorExtractStridedSliceOpAIEMLPattern
+    : public OpConversionPattern<vector::ExtractStridedSliceOp> {
+  using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ExtractStridedSliceOp extractOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto vType = cast<VectorType>(adaptor.getVector().getType());
+    if (vType.getRank() != 1)
+      return failure();
+
+    int64_t stride = cast<IntegerAttr>(adaptor.getStrides()[0]).getInt();
+    if (stride != 1)
+      return failure();
+
+    // We only accept the case where we are extracting a slice half the size
+    // of the input vector.
+    int64_t size = cast<IntegerAttr>(adaptor.getSizes()[0]).getInt();
+    if (vType.getNumElements() != 2 * size)
+      return failure();
+
+    auto shortVecType = cast<VectorType>(extractOp.getResult().getType());
+    auto bottomHalf = rewriter
+                          .create<aievec::ExtOp>(
+                              extractOp.getLoc(), shortVecType,
+                              adaptor.getVector(), rewriter.getI8IntegerAttr(0))
+                          .getResult();
+    auto topHalf = rewriter
+                       .create<aievec::ExtOp>(extractOp.getLoc(), shortVecType,
+                                              adaptor.getVector(),
+                                              rewriter.getI8IntegerAttr(1))
+                       .getResult();
+    int64_t offset = cast<IntegerAttr>(adaptor.getOffsets()[0]).getInt();
+    int32_t shiftBytes = offset * getElementSizeInBits(vType) / 8;
+    auto shiftBytesConstOp = rewriter.create<arith::ConstantOp>(
+        extractOp.getLoc(), rewriter.getIntegerType(32),
+        rewriter.getI32IntegerAttr(shiftBytes));
+    rewriter.replaceOpWithNewOp<aievec::ShiftOp>(
+        extractOp, shortVecType, bottomHalf, topHalf, shiftBytesConstOp);
+
+    return success();
+  }
+};
+
+// Replaces a short UPD op with a wide one followed by an ext op of the bottom
+// half.
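+//
+// E.g. (cf. test-conv-op-i16.mlir in this change; values renamed for
+// illustration):
+//   %0 = aievec.upd %A[%i, %j] {...} : memref<18x288xi16>, vector<16xi16>
+// becomes
+//   %w = aievec.upd %A[%i, %j] {...} : memref<18x288xi16>, vector<32xi16>
+//   %0 = aievec.ext %w {index = 0 : i8} : vector<32xi16>, vector<16xi16>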
+struct ExpandUPDToUPDAndExtPattern
+    : public OpConversionPattern<aievec::UPDOp> {
   using OpConversionPattern::OpConversionPattern;
 
-  SplitUPDOpOnAccPattern(MLIRContext *context, AnalysisManager &am,
-                         int32_t maxVectorSize = 256)
-      : OpConversionPattern<aievec::UPDOp>(context), am(am),
-        maxVectorSize(maxVectorSize) {}
+  ExpandUPDToUPDAndExtPattern(MLIRContext *context)
+      : OpConversionPattern<aievec::UPDOp>(context) {}
 
   LogicalResult
   matchAndRewrite(aievec::UPDOp updOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    if (am.getChildAnalysis<UPDOpEffectiveAccessSizeAnalysis>(updOp)
-            .effectiveSize < 2 * static_cast<unsigned>(maxVectorSize))
+    // Verify that we haven't already expanded this one
+    if (updOp->hasOneUse() && isa<aievec::ExtOp>(*updOp->getUsers().begin()))
       return failure();
 
-    auto updOp0 = rewriter.create<aievec::UPDOp>(
-        updOp.getLoc(), updOp.getResult().getType(), adaptor.getSource(),
-        adaptor.getIndices(), 0, 0);
-    rewriter.replaceOpWithNewOp<aievec::UPDOp>(
-        updOp, updOp.getResult().getType(), adaptor.getSource(),
-        adaptor.getIndices(), 2 * maxVectorSize, 1, updOp0.getResult());
+    auto vecType = cast<VectorType>(updOp.getType());
+    SmallVector<int64_t, 4> vecShape(vecType.getShape().begin(),
+                                     vecType.getShape().end());
+    vecShape[vecType.getRank() - 1] *= 2;
+    auto longVecType = VectorType::get(vecShape, vecType.getElementType());
+    auto newUpdOp = rewriter.create<aievec::UPDOp>(
+        updOp.getLoc(), longVecType, adaptor.getSource(), adaptor.getIndices(),
+        adaptor.getOffset(), adaptor.getIndex(), adaptor.getVector());
+    rewriter.replaceOpWithNewOp<aievec::ExtOp>(
+        updOp, vecType, newUpdOp.getResult(), rewriter.getI8IntegerAttr(0));
+
     return success();
   }
-
-  AnalysisManager &am;
-  int32_t maxVectorSize;
 };
 
+// Replaces a wide UPD op followed by an ext op of the bottom half with a short
+// UPD op.
+struct FuseExtIntoUPDPattern : public OpConversionPattern<aievec::ExtOp> {
+  using OpConversionPattern<aievec::ExtOp>::OpConversionPattern;
+
+  FuseExtIntoUPDPattern(MLIRContext *context)
+      : OpConversionPattern<aievec::ExtOp>(context) {}
+
+  LogicalResult
+  matchAndRewrite(aievec::ExtOp extOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Verify we are extracting the lower half...
+    if (extOp.getIndex() != 0)
+      return failure();
+    // ...of a UPDOp
+    auto updOp = dyn_cast<aievec::UPDOp>(extOp.getSource().getDefiningOp());
+    if (!updOp)
+      return failure();
+
+    // Verify that this is a direct upd -> ext pattern
+    if (!updOp->hasOneUse())
+      return failure();
+
+    rewriter.replaceOpWithNewOp<aievec::UPDOp>(
+        extOp, extOp.getType(), updOp.getSource(), updOp.getIndices(),
+        updOp.getOffset(), updOp.getIndex(), updOp.getVector());
+
+    return success();
+  }
+};
 
//===----------------------------------------------------------------------===//
@@ -1478,19 +1687,19 @@ struct SplitUPDOpOnAccPattern : public OpConversionPattern<aievec::UPDOp> {
 
 static void populateAIEVecV1ConversionPatterns(RewritePatternSet &patterns,
                                                AnalysisManager &am) {
-  patterns.add<LowerVectorTransferReadToAIEUPD, SplitUPDOpOnAccPattern>(
-      patterns.getContext(), am, 256);
+  patterns.add<LowerVectorTransferReadToAIEUPD>(patterns.getContext(), am, 128,
+                                                512, 128, 256);
   patterns
       .add<ConvertMulAddToAIEVecFMAOpPattern,
-           FoldBroadcastToFMAOp, LowerVectorAddIOpToAIEVecAddOp>(
-          patterns.getContext());
+           FoldBroadcastToFMAOp, LowerVectorAddIOpToAIEVecAddOp,
+           LowerVectorExtractStridedSliceOpAIEv1Pattern>(patterns.getContext());
 }
 
 static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
                                                AnalysisManager &am) {
-  patterns.add<LowerVectorTransferReadToAIEUPD, SplitUPDOpOnAccPattern>(
-      patterns.getContext(), am, 512);
+  patterns.add<LowerVectorTransferReadToAIEUPD>(patterns.getContext(), am, 128,
+                                                1024, 256, 1024);
   patterns.add<
       LowerVectorAddIOpToAIEVecAddElemOp, LowerVectorAddFOpToAIEVecAddElemOp,
@@ -1503,8 +1712,8 @@ static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
       LowerVectorReductionAddFloatOp, LowerVectorReductionAddBfloat16Op,
       FoldVectorExtractAndBroadcastToAIEBroadcast,
       ConvertMulAddToAIEVecFMAElemOpPattern,
-      ConvertMulIToAIEVecMulElemOpPattern, ConvertMulFToAIEVecMulElemOpPattern>(
-      patterns.getContext());
+      ConvertMulIToAIEVecMulElemOpPattern, ConvertMulFToAIEVecMulElemOpPattern,
+      LowerVectorExtractStridedSliceOpAIEMLPattern>(patterns.getContext());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1512,11 +1721,11 @@ static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
 //===----------------------------------------------------------------------===//
 
 // TODO: Review the validity of these legalizations beyond basic cases.
-
 static void configureAIEVecCommonLegalizations(ConversionTarget &target,
                                                AnalysisManager &am) {
   target.addLegalDialect<xilinx::aievec::AIEVecDialect>();
   target.addIllegalOp<vector::TransferReadOp>();
+  target.addIllegalOp<vector::ExtractStridedSliceOp>();
   target.addDynamicallyLegalOp<arith::AddIOp>(
      [](arith::AddIOp op) { return !isa<VectorType>(op.getType()); });
   target.addDynamicallyLegalOp<arith::AddFOp>(
@@ -1529,10 +1738,6 @@ static void configureAIEVecCommonLegalizations(ConversionTarget &target,
 
 static void configureAIEVecV1Legalizations(ConversionTarget &target,
                                            AnalysisManager &am) {
-  target.addDynamicallyLegalOp<xilinx::aievec::UPDOp>(
-      [&am](xilinx::aievec::UPDOp op) {
-        return am.getChildAnalysis<UPDOpEffectiveAccessSizeAnalysis>(op)
-                   .effectiveSize <= 512;
-      });
   target.addDynamicallyLegalOp<xilinx::aievec::FMAOp>(
      [](xilinx::aievec::FMAOp op) {
        auto lhsDefOp = op.getLhs().getDefiningOp();
        aievec::ConcatOp concatOp = nullptr;
@@ -1563,10 +1768,6 @@ static void configureAIEVecV2Legalizations(ConversionTarget &target,
                                            AnalysisManager &am) {
   target.addLegalOp<UnrealizedConversionCastOp>();
-  target.addDynamicallyLegalOp<aievec::UPDOp>([&am](aievec::UPDOp op) {
-    return am.getChildAnalysis<UPDOpEffectiveAccessSizeAnalysis>(op)
-               .effectiveSize <= 1024;
-  });
 
   // A set recording the vector lane size and element width supported
   llvm::SmallSet<std::pair<unsigned, unsigned>, 16> laneSizeElWidthPairSet;
@@ -1862,6 +2063,60 @@ createLowerVectorToAIEVec(const LowerVectorToAIEVecOptions &options) {
   return std::make_unique<LowerVectorToAIEVec>(options);
 }
 
+//===---------------------------------------------------------------------
+// Custom canonicalization passes
+//===---------------------------------------------------------------------
+
+// This pass widens UPD ops to twice their width, followed by an ext op of the
+// bottom half. This can be used together with SimplifyUPDOpsPass to find
+// additional common subexpressions with UPDs generated from unaligned
+// `transfer_read` ops.
+struct ExtendUPDOpsPass
+    : public PassWrapper<ExtendUPDOpsPass, OperationPass<func::FuncOp>> {
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    RewritePatternSet patterns(context);
+    ConversionTarget target(*context);
+    patterns.add<ExpandUPDToUPDAndExtPattern>(patterns.getContext());
+    target.addLegalDialect<xilinx::aievec::AIEVecDialect>();
+    target.addDynamicallyLegalOp<aievec::UPDOp>([](aievec::UPDOp op) {
+      return op.getVector() ||
+             (op->hasOneUse() &&
+              isa<aievec::ExtOp>(*op->getUsers().begin())) ||
+             llvm::all_of(op->getUsers(), [](Operation *op) {
+               return isa<aievec::UPDOp>(op);
+             });
+    });
+    auto func = getOperation();
+    if (failed(applyPartialConversion(func, target, std::move(patterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+
+// This pass replaces wide UPD ops that are only used by a single ext op of
+// the bottom half with a short UPD op. It undoes the work of
+// ExtendUPDOpsPass.
+// TODO: This pass can be extended to work with wide UPD ops that are used by
+// TODO: a single ext op of the top half, which might be a good opportunity to
+// TODO: further optimize wide UPDs.
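+//
+// E.g., if after CSE a widened load is only read through its bottom half,
+//   %w = aievec.upd %A[%i] {...} : memref<256xi32>, vector<16xi32>
+//   %v = aievec.ext %w {index = 0 : i8} : vector<16xi32>, vector<8xi32>
+// collapses back to
+//   %v = aievec.upd %A[%i] {...} : memref<256xi32>, vector<8xi32>
+// (a sketch of FuseExtIntoUPDPattern above; values renamed for illustration).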
+struct SimplifyUPDOpsPass
+    : public PassWrapper<SimplifyUPDOpsPass, OperationPass<func::FuncOp>> {
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    RewritePatternSet patterns(context);
+    ConversionTarget target(*context);
+    patterns.add<FuseExtIntoUPDPattern>(patterns.getContext());
+    target.addLegalDialect<xilinx::aievec::AIEVecDialect>();
+    target.addDynamicallyLegalOp<aievec::ExtOp>([](aievec::ExtOp op) {
+      auto defOp = op.getSource().getDefiningOp();
+      return !defOp || !isa<aievec::UPDOp>(defOp) || !defOp->hasOneUse() ||
+             op.getIndex() != 0;
+    });
+    auto func = getOperation();
+    if (failed(applyPartialConversion(func, target, std::move(patterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+
 //============================================================================//
 //=============== Main Vector2AIEVec Pipeline Configuration ==================//
 //============================================================================//
@@ -1870,6 +2125,11 @@ void xilinx::aievec::buildLowerVectorToAIEVec(
     OpPassManager &pm, const LowerVectorToAIEVecOptions &options) {
   // Add lowering from `Vector` to `AIEVec`
   pm.addPass(createLowerVectorToAIEVec(options));
+  pm.addPass(createCanonicalizerPass());
+
+  // Widen UPD ops, let CSE merge the resulting duplicate wide loads, and
+  // then narrow any widened loads that CSE did not pair up.
+  pm.addPass(std::make_unique<ExtendUPDOpsPass>());
   pm.addPass(createCSEPass());
+  pm.addPass(std::make_unique<SimplifyUPDOpsPass>());
   pm.addPass(createCanonicalizerPass());
 }
diff --git a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
index b460d0c30e..ce032fc08f 100644
--- a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
+++ b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 #include "VectorToVectorConversions.h"
 
@@ -30,25 +31,46 @@ using namespace xilinx::aievec;
 //============================================================================//
 
 // Return the offset of a given transfer read operation with regard to the
-// specified vector type. If the read is aligned size of the vector type, then
-// the offset is 0. Otherwise, the offset is the number of elements past the
-// immediately preceding aligned address.
+// specified vector type. If the read is aligned to the specified alignment
+// parameter (in bits), then the offset is 0. Otherwise, the offset is the
+// number of elements past the immediately preceding aligned vector length.
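+//
+// E.g., with a 128-bit alignment, a read of vector<8xi32> (32-bit elements)
+// must start at a multiple of 128 / 32 = 4 elements: a read at index 4
+// returns offset 0, while a read at index 3 returns offset 3 (cf. the
+// unaligned-load.mlir tests in this change, where that read is realized as
+// an aligned load shifted by 3 * 4 = 12 bytes).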
 template <
     typename TransferReadLikeOp,
     typename = std::enable_if_t<
        std::is_same_v<TransferReadLikeOp, vector::TransferReadOp> ||
        std::is_same_v<TransferReadLikeOp, vector::TransferReadOp::Adaptor>>>
-static unsigned getTransferReadAlignmentOffset(TransferReadLikeOp op,
-                                               VectorType vType) {
+static int64_t getTransferReadAlignmentOffset(TransferReadLikeOp readOp,
+                                              VectorType vType,
+                                              int64_t alignment) {
   // TODO: Add support for cases where the index is not coming from an
-  // TODO: `affine.apply` op. E.g.: when the index is a constant.
-  auto innerMostIndex = op.getIndices().back();
+  // TODO: `affine.apply` op or when the affine map has more than one
+  // TODO: dimension. We also need to address the case where the index is an
+  // TODO: induction variable.
+  auto innerMostIndex = readOp.getIndices().back();
   auto vectorLength = vType.getShape().back();
-  if (auto defOp = innerMostIndex.getDefiningOp())
-    if (auto applyOp = dyn_cast<AffineApplyOp>(defOp))
-      if (applyOp.getAffineMap().getNumDims() == 1)
-        return applyOp.getAffineMap().compose(ArrayRef<int64_t>{0})[0] %
-               vectorLength;
+  auto idxDefOp = innerMostIndex.getDefiningOp();
+  if (!idxDefOp)
+    return 0L;
+  int64_t vectorLengthAlignmentOffset =
+      TypeSwitch<Operation *, int64_t>(idxDefOp)
+          .Case<arith::ConstantOp>([&](auto constantOp) {
+            return cast<IntegerAttr>(constantOp.getValue()).getInt() %
+                   vectorLength;
+          })
+          .template Case<AffineApplyOp>([&](auto applyOp) {
+            if (applyOp.getAffineMap().getNumDims() == 1)
+              return applyOp.getAffineMap().compose(ArrayRef<int64_t>{0})[0] %
+                     vectorLength;
+            return 0L;
+          })
+          .Default([&](auto) {
+            // XXX: If we can't determine the offset, we assume the access is
+            // XXX: aligned.
+            return 0L;
+          });
+  int64_t absoluteAlignmentOffset = alignment / getElementSizeInBits(vType);
+  if (vectorLengthAlignmentOffset % absoluteAlignmentOffset)
+    return vectorLengthAlignmentOffset;
   return 0;
 }
 
@@ -64,6 +86,12 @@ struct SplitUnalignedTransferReadPattern
     : public OpConversionPattern<vector::TransferReadOp> {
   using OpConversionPattern::OpConversionPattern;
 
+  SplitUnalignedTransferReadPattern(MLIRContext *context,
+                                    int64_t minVectorSize,
+                                    int64_t maxVectorSize, int64_t alignment)
+      : OpConversionPattern<vector::TransferReadOp>(context),
+        minVectorSize(minVectorSize), maxVectorSize(maxVectorSize),
+        vectorAlignment(alignment) {}
+
   LogicalResult
   matchAndRewrite(vector::TransferReadOp readOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
@@ -73,29 +101,47 @@ struct SplitUnalignedTransferReadPattern
 
     // Check if the transfer is unaligned.
     auto vType = readOp.getVectorType();
-    unsigned offset = getTransferReadAlignmentOffset(adaptor, vType);
+    int64_t offset =
+        getTransferReadAlignmentOffset(adaptor, vType, vectorAlignment);
     if (offset == 0)
       return failure();
-    // Create an aligned transfer read
+    // Verify that we can load a vector 2x as long as the original.
+    auto vLen = vType.getShape().back();
+    auto longVecTy = VectorType::get(2 * vLen, vType.getElementType());
+    auto longVecSize = getElementSizeInBits(vType) * 2 * vLen;
+    if (longVecSize > maxVectorSize)
+      return failure();
 
     // Calculate the aligned indices for the lower and higher parts.
     // TODO: Add support for cases where the offset is greater than the
     // TODO: vector length.
-    auto lowIdx =
-        dyn_cast<AffineApplyOp>(adaptor.getIndices().back().getDefiningOp())
-            .getMapOperands()[0];
-    auto vLen = vType.getShape().back();
-    auto longVecTy = VectorType::get(2 * vLen, vType.getElementType());
+    auto loc = readOp.getLoc();
+    auto newInnerMostIdx =
+        TypeSwitch<Operation *, Value>(
+            adaptor.getIndices().back().getDefiningOp())
+            .Case<AffineApplyOp>(
+                [&](auto applyOp) { return applyOp.getMapOperands()[0]; })
+            .Case<arith::ConstantOp>([&](auto constantOp) {
+              auto cstValue = cast<IntegerAttr>(constantOp.getValue()).getInt();
+              auto newCstValue = cstValue - offset;
+              auto newConstantIdxOp = rewriter.create<arith::ConstantOp>(
+                  loc,
+                  rewriter.getIntegerAttr(constantOp.getType(), newCstValue));
+              return newConstantIdxOp.getResult();
+            })
+            .Default([&](auto) {
+              llvm_unreachable("Unexpected index type");
+              return nullptr;
+            });
     SmallVector<Value, 8> alignedIdx;
     alignedIdx.append(adaptor.getIndices().begin(), adaptor.getIndices().end());
-    alignedIdx[alignedIdx.size() - 1] = lowIdx;
+    alignedIdx[alignedIdx.size() - 1] = newInnerMostIdx;
 
     // Create the aligned transfer read for a vector 2x as long that covers the
     // elements of the unaligned vector.
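+    // E.g., an unaligned read of vector<8xi32> at index 3 becomes a read of
+    // vector<16xi32> at index 0, and the original vector is then recovered
+    // with a vector.extract_strided_slice {offsets = [3], sizes = [8]} that
+    // the target-specific patterns later lower to aievec.select/aievec.shift.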
    auto newReadOp = rewriter.create<vector::TransferReadOp>(
-        readOp.getLoc(), longVecTy, adaptor.getSource(), alignedIdx,
-        adaptor.getPadding());
+        loc, longVecTy, adaptor.getSource(), alignedIdx, adaptor.getPadding());
 
     // Create a `vector.extract_strided_slice` to extract the unaligned vector.
     rewriter.replaceOpWithNewOp<vector::ExtractStridedSliceOp>(
@@ -103,6 +149,10 @@ struct SplitUnalignedTransferReadPattern
 
     return success();
   }
+
+  int64_t minVectorSize;
+  int64_t maxVectorSize;
+  int64_t vectorAlignment;
 };
 
 // This pattern converts a `vector.transfer_read` with a splat permutation map
@@ -113,6 +163,9 @@ struct ConvertSplatTransferReadToBroadcastPattern
     : public OpConversionPattern<vector::TransferReadOp> {
   using OpConversionPattern::OpConversionPattern;
 
+  ConvertSplatTransferReadToBroadcastPattern(MLIRContext *context)
+      : OpConversionPattern<vector::TransferReadOp>(context) {}
+
   LogicalResult
   matchAndRewrite(vector::TransferReadOp readOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
@@ -163,16 +216,12 @@ struct ConvertSplatTransferReadToBroadcastPattern
 //============================================================================//
 //================ Common AIE canonicalization configuration =================//
 //============================================================================//
-
 static void
 configureCommonAIECanonicalizeLegalizations(ConversionTarget &target) {
-  target.addLegalDialect<memref::MemRefDialect, AffineDialect>();
+  target.addLegalDialect<memref::MemRefDialect>();
   target.addLegalDialect<AffineDialect>();
   target.addLegalDialect<arith::ArithDialect>();
-  target.addDynamicallyLegalOp<vector::TransferReadOp>(
-      [](vector::TransferReadOp op) {
-        return !op.getPermutationMap().isConstant();
-      });
+  target.addLegalDialect<vector::VectorDialect>();
 }
 
 static void
@@ -186,27 +235,36 @@ populateCommonAIECanonicalizeConversionPatterns(RewritePatternSet &patterns) {
 
 //============================================================================//
 //=============== AIEv1-specific canonicalization configuration ==============//
 //============================================================================//
 
 static void configureAIEv1CanonicalizeLegalizations(ConversionTarget &target) {
-  target.addLegalDialect<vector::VectorDialect>();
   target.addDynamicallyLegalOp<vector::TransferReadOp>(
      [](vector::TransferReadOp op) {
        return !op.getPermutationMap().isConstant() &&
-               getTransferReadAlignmentOffset(op, op.getVectorType()) == 0;
+               getTransferReadAlignmentOffset(op, op.getVectorType(), 128) == 0;
      });
 }
 
 static void
 populateAIEv1CanonicalizeConversionPatterns(RewritePatternSet &patterns) {
-  patterns.add<ConvertSplatTransferReadToBroadcastPattern>(patterns.getContext());
+  patterns.add<SplitUnalignedTransferReadPattern>(patterns.getContext(), 128,
+                                                  512, 128);
 }
 
 //============================================================================//
 //============== AIEML-specific canonicalization configuration ===============//
 //============================================================================//
 
-static void configureAIEMLCanonicalizeLegalizations(ConversionTarget &target) {}
+static void configureAIEMLCanonicalizeLegalizations(ConversionTarget &target) {
+  target.addDynamicallyLegalOp<vector::TransferReadOp>(
+      [](vector::TransferReadOp op) {
+        return !op.getPermutationMap().isConstant() &&
+               getTransferReadAlignmentOffset(op, op.getVectorType(), 256) == 0;
+      });
+}
 
 static void
-populateAIEMLCanonicalizeConversionPatterns(RewritePatternSet &patterns) {}
+populateAIEMLCanonicalizeConversionPatterns(RewritePatternSet &patterns) {
+  patterns.add<SplitUnalignedTransferReadPattern>(patterns.getContext(), 128,
+                                                  1024, 256);
+}
 
 //============================================================================//
 //=================== Common AIE Canonicalization Passes =====================//
diff --git a/test/Conversion/VectorToAIEVec/test-conv-op-i16.mlir b/test/Conversion/VectorToAIEVec/test-conv-op-i16.mlir
index 0a2c05c6f5..37db765e3d 100644
--- a/test/Conversion/VectorToAIEVec/test-conv-op-i16.mlir
+++ 
b/test/Conversion/VectorToAIEVec/test-conv-op-i16.mlir @@ -2,37 +2,40 @@ func.func @conv2d(%arg0: memref<18x288xi16>, %arg1: memref<9xi16>, %arg2: memref<16x256xi16>) { %c0 = arith.constant 0 : index + %c2_i32 = arith.constant 2 : i32 + %c4_i32 = arith.constant 4 : i32 affine.for %arg3 = 0 to 16 { affine.for %arg4 = 0 to 256 step 16 { - %0 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<16xi16> + %0 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<32xi16> + %sbh = aievec.ext %0 {index = 0 : i8} : vector<32xi16>, vector<16xi16> + %sth = aievec.ext %0 {index = 1 : i8} : vector<32xi16>, vector<16xi16> %1 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : si32} : memref<9xi16>, vector<16xi16> %2 = aievec.broadcast %1 {idx = 0 : i8} : vector<16xi16>, vector<16xi16> - %3 = arith.muli %0, %2 : vector<16xi16> - %4 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4) - %5 = aievec.upd %arg0[%arg3, %4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<16xi16> - %6 = aievec.broadcast %1 {idx = 1 : i8} : vector<16xi16>, vector<16xi16> - %7 = arith.muli %5, %6 : vector<16xi16> - %8 = arith.addi %3, %7 : vector<16xi16> - %9 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4) - %10 = aievec.upd %arg0[%arg3, %9] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<16xi16> - %11 = aievec.broadcast %1 {idx = 2 : i8} : vector<16xi16>, vector<16xi16> - %12 = arith.muli %10, %11 : vector<16xi16> - %13 = arith.addi %8, %12 : vector<16xi16> - vector.transfer_write %13, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> + %3 = arith.muli %sbh, %2 : vector<16xi16> + %4 = aievec.shift %sbh, %sth, %c2_i32 {isAcc = false} : vector<16xi16>, vector<16xi16>, i32, vector<16xi16> + %5 = aievec.broadcast %1 {idx = 1 : i8} : vector<16xi16>, vector<16xi16> + %6 = arith.muli %4, %5 : vector<16xi16> + %7 = arith.addi %3, %6 : vector<16xi16> + %8 = aievec.shift %sbh, %sth, %c4_i32 {isAcc = false} : vector<16xi16>, vector<16xi16>, i32, vector<16xi16> + %9 = aievec.broadcast %1 {idx = 2 : i8} : vector<16xi16>, vector<16xi16> + %10 = arith.muli %8, %9 : vector<16xi16> + %11 = arith.addi %7, %10 : vector<16xi16> + vector.transfer_write %11, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> } } return } -// CHECK-LABEL: func @conv2d -// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi16> -// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<9xi16> -// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi16> -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[T0:.*]] = aievec.upd %[[A1:.*]][%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<9xi16>, vector<32xi16> -// CHECK: affine.for %[[A3:.*]] = 0 to 16 { -// CHECK: affine.for %[[A4:.*]] = 0 to 256 step 16 { -// CHECK: %[[T1:.*]] = aievec.upd %[[A0:.*]][%[[A3:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<32xi16> -// CHECK: %[[T2:.*]] = aievec.mul_conv %[[T1:.*]], %[[T0:.*]] {M = 16 : i32, N = 4 : i32} : vector<32xi16>, vector<32xi16>, vector<16xi64> -// CHECK: %[[T3:.*]] = aievec.srs %[[T2:.*]] {shift = 10 : i8} : vector<16xi64>, vector<16xi16> -// CHECK: vector.transfer_write %[[T3:.*]], %[[A2:.*]][%[[A3:.*]], %[[A4:.*]]] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> +// CHECK-LABEL: func @conv2d +// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi16> +// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<9xi16> +// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: 
memref<16x256xi16> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[T0:.*]] = aievec.upd %[[A1]][%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<9xi16>, vector<16xi16> +// CHECK: %[[T1:.*]] = aievec.concat %[[T0]], %[[T0]] : vector<16xi16>, vector<32xi16> +// CHECK: affine.for %[[A3:.*]] = 0 to 16 { +// CHECK: affine.for %[[A4:.*]] = 0 to 256 step 16 { +// CHECK: %[[T2:.*]] = aievec.upd %[[A0]][%[[A3]], %[[A4]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<32xi16> +// CHECK: %[[T3:.*]] = aievec.mul_conv %[[T2]], %[[T1]] {M = 16 : i32, N = 4 : i32} : vector<32xi16>, vector<32xi16>, vector<16xi64> +// CHECK: %[[T4:.*]] = aievec.srs %[[T3]] {shift = 10 : i8} : vector<16xi64>, vector<16xi16> +// CHECK: vector.transfer_write %[[T4]], %[[A2]][%[[A3]], %[[A4]]] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> diff --git a/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir b/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir index 57a98d9a1e..4792fa7190 100644 --- a/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir +++ b/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir @@ -2,21 +2,23 @@ func.func @conv2d(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memref<16x256xi8>) { %c0 = arith.constant 0 : index + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 affine.for %arg3 = 0 to 16 { affine.for %arg4 = 0 to 256 step 32 { %0 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<16x256xi8>, vector<32xi8> - %1 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> + %1 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> + %sbh = aievec.ext %1 {index = 0 : i8} : vector<64xi8>, vector<32xi8> + %sth = aievec.ext %1 {index = 1 : i8} : vector<64xi8>, vector<32xi8> %2 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> %3 = aievec.broadcast %2 {idx = 0 : i8} : vector<32xi8>, vector<32xi8> - %4 = arith.muli %1, %3 : vector<32xi8> + %4 = arith.muli %sbh, %3 : vector<32xi8> %5 = arith.addi %0, %4 : vector<32xi8> - %6 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4) - %7 = aievec.upd %arg0[%arg3, %6] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> + %7 = aievec.shift %sbh, %sth, %c1_i32 {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> %8 = aievec.broadcast %2 {idx = 2 : i8} : vector<32xi8>, vector<32xi8> %9 = arith.muli %7, %8 : vector<32xi8> %10 = arith.addi %5, %9 : vector<32xi8> - %11 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4) - %12 = aievec.upd %arg0[%arg3, %11] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> + %12 = aievec.shift %sbh, %sth, %c2_i32 {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> %13 = aievec.broadcast %2 {idx = 4 : i8} : vector<32xi8>, vector<32xi8> %14 = arith.muli %12, %13 : vector<32xi8> %15 = arith.addi %10, %14 : vector<32xi8> @@ -26,18 +28,19 @@ func.func @conv2d(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memref< return } -// CHECK-LABEL: func @conv2d -// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi8> -// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<48xi8> -// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi8> -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[T0:.*]] = aievec.upd %[[A1:.*]][%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<64xi8> -// CHECK: %[[T1:.*]] = aievec.shuffle 
%[[T0:.*]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8> -// CHECK: affine.for %[[A3:.*]] = 0 to 16 { -// CHECK: affine.for %[[A4:.*]] = 0 to 256 step 32 { -// CHECK: %[[T2:.*]] = aievec.upd %[[A2]][%[[A3]], %[[A4]]] {index = 0 : i8, offset = 0 : si32} : memref<16x256xi8>, vector<32xi8> -// CHECK: %[[T3:.*]] = aievec.upd %[[A0]][%[[A3]], %[[A4]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> -// CHECK: %[[T4:.*]] = aievec.ups %[[T2]] {shift = 0 : i8} : vector<32xi8>, vector<32xi32> -// CHECK: %[[T5:.*]] = aievec.fma_conv %[[T3]], %[[T1]], %[[T4]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> -// CHECK: %[[T6:.*]] = aievec.srs %[[T5]] {shift = 0 : i8} : vector<32xi32>, vector<32xi8> -// CHECK: vector.transfer_write %[[T6]], %[[A2]][%[[A3]], %[[A4]]] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> +// CHECK-LABEL: func @conv2d +// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi8> +// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<48xi8> +// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi8> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[T0:.*]] = aievec.upd %[[A1]][%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> +// CHECK: %[[T1:.*]] = aievec.concat %[[T0]], %[[T0]] : vector<32xi8>, vector<64xi8> +// CHECK: %[[T2:.*]] = aievec.shuffle %[[T1]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8> +// CHECK: affine.for %[[I:.*]] = 0 to 16 { +// CHECK: affine.for %[[J:.*]] = 0 to 256 step 32 { +// CHECK: %[[T3:.*]] = aievec.upd %[[A2]][%[[I]], %[[J]]] {index = 0 : i8, offset = 0 : si32} : memref<16x256xi8>, vector<32xi8> +// CHECK: %[[T4:.*]] = aievec.upd %[[A0]][%[[I]], %[[J]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> +// CHECK: %[[T5:.*]] = aievec.ups %[[T3]] {shift = 0 : i8} : vector<32xi8>, vector<32xi32> +// CHECK: %[[T6:.*]] = aievec.fma_conv %[[T4]], %[[T2]], %[[T5]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> +// CHECK: %[[T7:.*]] = aievec.srs %[[T6]] {shift = 0 : i8} : vector<32xi32>, vector<32xi8> +// CHECK: vector.transfer_write %[[T7]], %[[A2]][%[[I]], %[[J]]] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> diff --git a/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir b/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir index 5a5d4901f0..50f6d49a47 100644 --- a/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir +++ b/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir @@ -2,38 +2,41 @@ func.func @conv2d(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memref<16x256xi8>) { %c0 = arith.constant 0 : index + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 affine.for %arg3 = 0 to 16 { affine.for %arg4 = 0 to 256 step 32 { - %0 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> + %0 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> + %sbh = aievec.ext %0 {index = 0 : i8} : vector<64xi8>, vector<32xi8> + %sth = aievec.ext %0 {index = 1 : i8} : vector<64xi8>, vector<32xi8> %1 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> %2 = aievec.broadcast %1 {idx = 0 : i8} : vector<32xi8>, vector<32xi8> - %3 = arith.muli %0, %2 : vector<32xi8> - %4 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4) - %5 = aievec.upd %arg0[%arg3, %4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> - %6 = aievec.broadcast %1 {idx = 2 : i8} : 
vector<32xi8>, vector<32xi8> - %7 = arith.muli %5, %6 : vector<32xi8> - %8 = arith.addi %3, %7 : vector<32xi8> - %9 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4) - %10 = aievec.upd %arg0[%arg3, %9] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> - %11 = aievec.broadcast %1 {idx = 4 : i8} : vector<32xi8>, vector<32xi8> - %12 = arith.muli %10, %11 : vector<32xi8> - %13 = arith.addi %8, %12 : vector<32xi8> - vector.transfer_write %13, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> + %3 = arith.muli %sbh, %2 : vector<32xi8> + %4 = aievec.shift %sbh, %sth, %c1_i32 {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> + %5 = aievec.broadcast %1 {idx = 2 : i8} : vector<32xi8>, vector<32xi8> + %6 = arith.muli %4, %5 : vector<32xi8> + %7 = arith.addi %3, %6 : vector<32xi8> + %8 = aievec.shift %sbh, %sth, %c2_i32 {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> + %9 = aievec.broadcast %1 {idx = 4 : i8} : vector<32xi8>, vector<32xi8> + %10 = arith.muli %8, %9 : vector<32xi8> + %11 = arith.addi %7, %10 : vector<32xi8> + vector.transfer_write %11, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> } } return } -// CHECK-LABEL: func @conv2d -// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi8> -// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<48xi8> -// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi8> -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[T0:.*]] = aievec.upd %[[A1:.*]][%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<64xi8> -// CHECK: %[[T1:.*]] = aievec.shuffle %[[T0:.*]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8> -// CHECK: affine.for %[[A3:.*]] = 0 to 16 { -// CHECK: affine.for %[[A4:.*]] = 0 to 256 step 32 { -// CHECK: %[[T2:.*]] = aievec.upd %[[A0:.*]][%[[A3:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> -// CHECK: %[[T3:.*]] = aievec.mul_conv %[[T2:.*]], %[[T1:.*]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> -// CHECK: %[[T4:.*]] = aievec.srs %[[T3:.*]] {shift = 0 : i8} : vector<32xi32>, vector<32xi8> -// CHECK: vector.transfer_write %[[T4:.*]], %[[A2:.*]][%[[A3:.*]], %[[A4:.*]]] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> +// CHECK-LABEL: func @conv2d +// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi8> +// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<48xi8> +// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi8> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[T0:.*]] = aievec.upd %[[A1]][%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> +// CHECK: %[[T1:.*]] = aievec.concat %[[T0]], %[[T0]] : vector<32xi8>, vector<64xi8> +// CHECK: %[[T2:.*]] = aievec.shuffle %[[T1]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8> +// CHECK: affine.for %[[I:.*]] = 0 to 16 { +// CHECK: affine.for %[[J:.*]] = 0 to 256 step 32 { +// CHECK: %[[T3:.*]] = aievec.upd %[[A0]][%[[I]], %[[J]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> +// CHECK: %[[T4:.*]] = aievec.mul_conv %[[T3]], %[[T2]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> +// CHECK: %[[T5:.*]] = aievec.srs %[[T4]] {shift = 0 : i8} : vector<32xi32>, vector<32xi8> +// CHECK: vector.transfer_write %[[T5]], %[[A2]][%[[I]], %[[J]]] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> diff --git a/test/Conversion/VectorToAIEVec/test-upd.mlir b/test/Conversion/VectorToAIEVec/test-upd.mlir index a246394108..ada3619e6f 
100644 --- a/test/Conversion/VectorToAIEVec/test-upd.mlir +++ b/test/Conversion/VectorToAIEVec/test-upd.mlir @@ -1,49 +1,66 @@ -// RUN: aie-opt %s --convert-vector-to-aievec | FileCheck %s -// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" | FileCheck %s --check-prefix=CHECK-V2 +// RUN: aie-opt %s --convert-vector-to-aievec -split-input-file | FileCheck %s +// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" -split-input-file | FileCheck %s --check-prefix=CHECK-V2 +// CHECK-V2-LABEL: func @veccopy_i8 func.func @veccopy_i8(%arg0: memref<256xi8>, %arg1: memref<256xi8>) { %c0_i8 = arith.constant 0 : i8 affine.for %arg2 = 0 to 256 step 16 { - // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi8>, vector<16xi8> + // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi8>, vector<16xi8> %0 = vector.transfer_read %arg0[%arg2], %c0_i8 : memref<256xi8>, vector<16xi8> - // CHECK: vector.transfer_write %[[LD]], {{.*}} + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} vector.transfer_write %0, %arg1[%arg2] : vector<16xi8>, memref<256xi8> } return } +// ----- + +// CHECK-LABEL: func @veccopy_i16 +// CHECK-V2-LABEL: func @veccopy_i16 func.func @veccopy_i16(%arg0: memref<256xi16>, %arg1: memref<256xi16>) { %c0_i16 = arith.constant 0 : i16 affine.for %arg2 = 0 to 256 step 16 { // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi16>, vector<16xi16> + // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi16>, vector<16xi16> %0 = vector.transfer_read %arg0[%arg2], %c0_i16 : memref<256xi16>, vector<16xi16> // CHECK: vector.transfer_write %[[LD]], {{.*}} + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} vector.transfer_write %0, %arg1[%arg2] : vector<16xi16>, memref<256xi16> } return } +// ----- + +// CHECK-LABEL: func @veccopy_i32 +// CHECK-V2-LABEL: func @veccopy_i32 func.func @veccopy_i32(%arg0: memref<256xi32>, %arg1: memref<256xi32>) { %c0_i32 = arith.constant 0 : i32 - affine.for %arg2 = 0 to 256 step 16 { - // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<16xi32> - %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<16xi32> + affine.for %arg2 = 0 to 256 step 8 { + // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<8xi32> + // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<8xi32> + %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<8xi32> // CHECK: vector.transfer_write %[[LD]], {{.*}} - vector.transfer_write %0, %arg1[%arg2] : vector<16xi32>, memref<256xi32> + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} + vector.transfer_write %0, %arg1[%arg2] : vector<8xi32>, memref<256xi32> } return } +// ----- + +// CHECK-LABEL: func @veccopy_long_i32 +// CHECK-V2-LABEL: func @veccopy_long_i32 func.func @veccopy_long_i32(%arg0: memref<256xi32>, %arg1: memref<256xi32>) { %c0_i32 = arith.constant 0 : i32 - affine.for %arg2 = 0 to 256 step 32 { - // CHECK: %[[LD0:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<32xi32> - // CHECK-NEXT: %[[LD1:.*]] = aievec.upd {{.*}}, %[[LD0]] {index = 1 : i8, offset = 512 : si32} : memref<256xi32>, vector<32xi32> + affine.for %arg2 = 0 to 256 step 16 { + // CHECK: %[[LD0:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, 
vector<16xi32> + // CHECK-NEXT: %[[LD1:.*]] = aievec.upd {{.*}}, %[[LD0]] {index = 1 : i8, offset = 256 : si32} : memref<256xi32>, vector<16xi32> // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<16xi32> - %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<32xi32> + %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<16xi32> // CHECK: vector.transfer_write %[[LD1]], {{.*}} // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} - vector.transfer_write %0, %arg1[%arg2] : vector<32xi32>, memref<256xi32> + vector.transfer_write %0, %arg1[%arg2] : vector<16xi32>, memref<256xi32> } return } diff --git a/test/Conversion/VectorToAIEVec/unaligned-load-aieml.mlir b/test/Conversion/VectorToAIEVec/unaligned-load-aieml.mlir new file mode 100644 index 0000000000..8774b1f4bc --- /dev/null +++ b/test/Conversion/VectorToAIEVec/unaligned-load-aieml.mlir @@ -0,0 +1,25 @@ +// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" -split-input-file | FileCheck %s --check-prefix=CHECK +func.func @unaligned_read(%a: memref<48xi8>) -> (vector<32xi8>, vector<32xi8>) { + %c0_i8 = arith.constant 0 : i8 + %c16 = arith.constant 16 : index + %c34 = arith.constant 34 : index + %0 = vector.transfer_read %a[%c16], %c0_i8 : memref<48xi8>, vector<32xi8> + %1 = vector.transfer_read %a[%c34], %c0_i8 : memref<48xi8>, vector<32xi8> + return %0, %1 : vector<32xi8>, vector<32xi8> +} + +// CHECK-LABEL: func @unaligned_read +// CHECK: %[[C2i32:.*]] = arith.constant 2 : i32 +// CHECK: %[[C32:.*]] = arith.constant 32 : index +// CHECK: %[[C16i32:.*]] = arith.constant 16 : i32 +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[T0:.*]] = aievec.upd {{.*}}[%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<64xi8> +// CHECK: %[[T0E0:.*]] = aievec.ext %[[T0]] {index = 0 : i8} : vector<64xi8>, vector<32xi8> +// CHECK: %[[T0E1:.*]] = aievec.ext %[[T0]] {index = 1 : i8} : vector<64xi8>, vector<32xi8> +// CHECK: %[[R0:.*]] = aievec.shift %[[T0E0]], %[[T0E1]], %[[C16i32]] {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> +// CHECK: %[[T1:.*]] = aievec.upd {{.*}}[%[[C32:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<64xi8> +// CHECK: %[[T1E0:.*]] = aievec.ext %[[T1]] {index = 0 : i8} : vector<64xi8>, vector<32xi8> +// CHECK: %[[T1E1:.*]] = aievec.ext %[[T1]] {index = 1 : i8} : vector<64xi8>, vector<32xi8> +// CHECK: %[[R1:.*]] = aievec.shift %[[T1E0]], %[[T1E1]], %[[C2i32]] {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> +// CHECK: return %[[R0:.*]], %[[R1:.*]] : vector<32xi8>, vector<32xi8> + diff --git a/test/Conversion/VectorToAIEVec/unaligned-load.mlir b/test/Conversion/VectorToAIEVec/unaligned-load.mlir index d547c0f9ac..2e4f8e4503 100644 --- a/test/Conversion/VectorToAIEVec/unaligned-load.mlir +++ b/test/Conversion/VectorToAIEVec/unaligned-load.mlir @@ -1,20 +1,70 @@ -// RUN: aie-opt %s --convert-vector-to-aievec | FileCheck %s -func.func @unaligned_read(%a: memref<48xi8>) -> (vector<32xi8>, vector<32xi8>) { - %c0_i8 = arith.constant 0 : i8 - %c16 = arith.constant 16 : index - %c34 = arith.constant 34 : index - %0 = vector.transfer_read %a[%c16], %c0_i8 : memref<48xi8>, vector<32xi8> - %1 = vector.transfer_read %a[%c34], %c0_i8 : memref<48xi8>, vector<32xi8> - return %0, %1 : vector<32xi8>, vector<32xi8> +// RUN: aie-opt %s --convert-vector-to-aievec -split-input-file | FileCheck %s +// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" 
-split-input-file | FileCheck %s --check-prefix=CHECK-V2 + +// CHECK-LABEL: func @unaligned_read +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[V0B:.*]] = aievec.upd %{{.*}}[%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<64xi32>, vector<16xi32> +// CHECK: %[[V0T:.*]] = aievec.upd %{{.*}}[%[[C0]]], %[[V0B]] {index = 1 : i8, offset = 256 : si32} : memref<64xi32>, vector<16xi32> +// CHECK: %[[V0ROT:.*]] = aievec.select %[[V0T]] {select = "0", xoffsets = "0x76543210", xsquare = "0x3210", xstart = "3", +// CHECK-SAME: yoffsets = "0", ysquare = "0", ystart = "0"} +// CHECK-SAME: : vector<16xi32>, vector<16xi32> +// CHECK: %[[V0:.*]] = aievec.ext %[[V0ROT]] {index = 0 : i8} : vector<16xi32>, vector<8xi32> +// CHECK: %[[V1ROT:.*]] = aievec.select %[[V0T]] {select = "0", xoffsets = "0x76543210", xsquare = "0x3210", xstart = "6", +// CHECK-SAME: yoffsets = "0", ysquare = "0", ystart = "0"} +// CHECK-SAME: : vector<16xi32>, vector<16xi32> +// CHECK: %[[V1:.*]] = aievec.ext %[[V1ROT]] {index = 0 : i8} : vector<16xi32>, vector<8xi32> +// CHECK: return %[[V0]], %[[V1]] : vector<8xi32>, vector<8xi32> + +// CHECK-V2-LABEL: func @unaligned_read +// CHECK-V2: %[[C24i32:.*]] = arith.constant 24 : i32 +// CHECK-V2: %[[C12i32:.*]] = arith.constant 12 : i32 +// CHECK-V2: %[[C0:.*]] = arith.constant 0 : index +// CHECK-V2: %[[LV:.*]] = aievec.upd %{{.*}}[%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<64xi32>, vector<16xi32> +// CHECK-V2: %[[LV0:.*]] = aievec.ext %[[LV]] {index = 0 : i8} : vector<16xi32>, vector<8xi32> +// CHECK-V2: %[[LV1:.*]] = aievec.ext %[[LV]] {index = 1 : i8} : vector<16xi32>, vector<8xi32> +// CHECK-V2: %[[R0:.*]] = aievec.shift %[[LV0]], %[[LV1]], %[[C12i32]] {isAcc = false} : vector<8xi32>, vector<8xi32>, i32, vector<8xi32> +// CHECK-V2: %[[R1:.*]] = aievec.shift %[[LV0]], %[[LV1]], %[[C24i32]] {isAcc = false} : vector<8xi32>, vector<8xi32>, i32, vector<8xi32> +// CHECK-V2: return %[[R0]], %[[R1]] : vector<8xi32>, vector<8xi32> +func.func @unaligned_read(%m: memref<64xi32>) -> (vector<8xi32>, vector<8xi32>) { + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %0 = vector.transfer_read %m[%c3], %c0_i32 : memref<64xi32>, vector<8xi32> + %1 = vector.transfer_read %m[%c6], %c0_i32 : memref<64xi32>, vector<8xi32> + return %0, %1 : vector<8xi32>, vector<8xi32> } +// ----- + // CHECK-LABEL: func @unaligned_read -// CHECK : %[[C64:.*]] = arith.constant 64 : index -// CHECK : %[[C32:.*]] = arith.constant 32 : index -// CHECK : %[[C0:.*]] = arith.constant 0 : index -// CHECK : %[[T0:.*]] = aievec.upd {{.*}}[%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> -// CHECK : %[[T1:.*]] = aievec.upd {{.*}}[%[[C32:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> -// CHECK : %[[T2:.*]] = aievec.shift %[[T0:.*]], %[[T1:.*]] {shift = 16 : i32} : vector<32xi8>, vector<32xi8> -// CHECK : %[[T3:.*]] = aievec.upd {{.*}}[%[[C64:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> -// CHECK : %[[T4:.*]] = aievec.shift %[[T1:.*]], %[[T3:.*]] {shift = 2 : i32} : vector<32xi8>, vector<32xi8> -// CHECK : return %[[T2:.*]], %[[T4:.*]] : vector<32xi8>, vector<32xi8> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[V0B:.*]] = aievec.upd %{{.*}}[%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<64xi16>, vector<32xi16> +// CHECK: %[[V0T:.*]] = aievec.upd %{{.*}}[%[[C0]]], %[[V0B]] {index = 1 : i8, offset = 256 : si32} : memref<64xi16>, vector<32xi16> +// 
CHECK: %[[V0ROT:.*]] = aievec.select %[[V0T]] {select = "0x11111111", xoffsets = "0x06040200", xoffsets_hi = "0x0e0c0a08", xsquare = "0x2103", xstart = "4", +// CHECK-SAME: yoffsets = "0x0503010f", yoffsets_hi = "0x0d0b0907", ysquare = "0x2103", ystart = "2"} +// CHECK-SAME: : vector<32xi16>, vector<32xi16> +// CHECK: %[[V0:.*]] = aievec.ext %[[V0ROT]] {index = 0 : i8} : vector<32xi16>, vector<16xi16> +// CHECK: %[[V1ROT:.*]] = aievec.select %[[V0T]] {select = "0", xoffsets = "0x06040200", xoffsets_hi = "0x0e0c0a08", xsquare = "0x3210", xstart = "6", +// CHECK-SAME: yoffsets = "0", yoffsets_hi = "0", ysquare = "0", ystart = "0"} +// CHECK-SAME: : vector<32xi16>, vector<32xi16> +// CHECK: %[[V1:.*]] = aievec.ext %[[V1ROT]] {index = 0 : i8} : vector<32xi16>, vector<16xi16> +// CHECK: return %[[V0]], %[[V1]] : vector<16xi16>, vector<16xi16> + +// CHECK-V2-LABEL: func @unaligned_read +// CHECK-V2: %[[C12i32:.*]] = arith.constant 12 : i32 +// CHECK-V2: %[[C6i32:.*]] = arith.constant 6 : i32 +// CHECK-V2: %[[C0:.*]] = arith.constant 0 : index +// CHECK-V2: %[[LV:.*]] = aievec.upd %{{.*}}[%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<64xi16>, vector<32xi16> +// CHECK-V2: %[[LV0:.*]] = aievec.ext %[[LV]] {index = 0 : i8} : vector<32xi16>, vector<16xi16> +// CHECK-V2: %[[LV1:.*]] = aievec.ext %[[LV]] {index = 1 : i8} : vector<32xi16>, vector<16xi16> +// CHECK-V2: %[[R0:.*]] = aievec.shift %[[LV0]], %[[LV1]], %[[C6i32]] {isAcc = false} : vector<16xi16>, vector<16xi16>, i32, vector<16xi16> +// CHECK-V2: %[[R1:.*]] = aievec.shift %[[LV0]], %[[LV1]], %[[C12i32]] {isAcc = false} : vector<16xi16>, vector<16xi16>, i32, vector<16xi16> +// CHECK-V2: return %[[R0]], %[[R1]] : vector<16xi16>, vector<16xi16> +func.func @unaligned_read(%m: memref<64xi16>) -> (vector<16xi16>, vector<16xi16>) { + %c0_i16 = arith.constant 0 : i16 + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %0 = vector.transfer_read %m[%c3], %c0_i16 : memref<64xi16>, vector<16xi16> + %1 = vector.transfer_read %m[%c6], %c0_i16 : memref<64xi16>, vector<16xi16> + return %0, %1 : vector<16xi16>, vector<16xi16> +} diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/helplib.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/helplib.cc new file mode 100644 index 0000000000..c1347a4848 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/helplib.cc @@ -0,0 +1,48 @@ +#include "aie_api/aie.hpp" +#include "aie_api/utils.hpp" +#include +#include + +template const char *tid() { return "@"; } + +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } + +template void printv(vtype v) { + printf("vector<%dx%s%u>[ ", nlanes, tid(), 8 * sizeof(elemtype)); + aie::print(aie::vector(v)); + printf("]\n"); +} + +void printv16xi32(v16int32 v) { printv<16, int32_t>(v); } + +void printv8xi32(v8int32 v) { printv<8, int32_t>(v); } + +void printv32xi16(v32int16 v) { printv<32, int16_t>(v); } + +void printv16xi16(v16int16 v) { printv<16, int16_t>(v); } + +void printv32xi8(v32int8 v) { printv<32, int8_t>(v); } + +alignas(32) int32_t buff_i32[64]; +alignas(32) int16_t buff_i16[64]; +alignas(32) int8_t buff_i8[64]; + +int32_t *loadA64xi32() { + for (int i = 0; i < 64; ++i) + buff_i32[i] = i; + return buff_i32; +} + +int16_t *loadA64xi16() { + for (int i = 0; i < 64; ++i) + buff_i16[i] = i; + return buff_i16; +} + +int8_t *loadA64xi8() { + for (int i = 0; i < 64; ++i) + buff_i8[i] = i; + return buff_i8; +} diff --git 
a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/kernel.mlir b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/kernel.mlir new file mode 100644 index 0000000000..0834252631 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/kernel.mlir @@ -0,0 +1,81 @@ +// REQUIRES: valid_xchess_license +// RUN: aie-opt %s -convert-vector-to-aievec="aie-target=aieml" | aie-translate -aieml=true -aievec-to-cpp -o kernel.tmp.cc +// RUN: echo "#include " > kernel.cc && cat kernel.tmp.cc >> kernel.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ -D__AIEARCH__=20 kernel.cc %S/helplib.cc %S/main.cc +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s + +func.func private @printv32xi16(%v : vector<32xi16>) +func.func private @loadA64xi16() -> memref<64xi16> + +#map6 = affine_map<(d0) -> (d0 + 6)> +#map7 = affine_map<(d0) -> (d0 + 7)> +#map8 = affine_map<(d0) -> (d0 + 8)> +#map9 = affine_map<(d0) -> (d0 + 9)> +#map10 = affine_map<(d0) -> (d0 + 10)> + +func.func @entry() -> i32 { + %c0_i32 = arith.constant 0 : i32 + %c0_i16 = arith.constant 0 : i16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %c6 = arith.constant 6 : index + %c7 = arith.constant 7 : index + %c8 = arith.constant 8 : index + %c9 = arith.constant 9 : index + %c10 = arith.constant 10 : index + %c11 = arith.constant 11 : index + %c12 = arith.constant 12 : index + %c13 = arith.constant 13 : index + %c14 = arith.constant 14 : index + %c15 = arith.constant 15 : index + + %buffi16 = func.call @loadA64xi16() : () -> (memref<64xi16>) + %v16 = vector.transfer_read %buffi16[%c0], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%v16) : (vector<32xi16>) -> () + + %1 = vector.transfer_read %buffi16[%c1], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%1) : (vector<32xi16>) -> () + %2 = vector.transfer_read %buffi16[%c2], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%2) : (vector<32xi16>) -> () + %3 = vector.transfer_read %buffi16[%c3], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%3) : (vector<32xi16>) -> () + %4 = vector.transfer_read %buffi16[%c4], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%4) : (vector<32xi16>) -> () + %5 = vector.transfer_read %buffi16[%c5], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%5) : (vector<32xi16>) -> () + + %i6 = affine.apply #map6(%c0) + %6 = vector.transfer_read %buffi16[%i6], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%6) : (vector<32xi16>) -> () + %i7 = affine.apply #map7(%c0) + %7 = vector.transfer_read %buffi16[%i7], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%7) : (vector<32xi16>) -> () + %i8 = affine.apply #map8(%c0) + %8 = vector.transfer_read %buffi16[%i8], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%8) : (vector<32xi16>) -> () + %i9 = affine.apply #map9(%c0) + %9 = vector.transfer_read %buffi16[%i9], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%9) : (vector<32xi16>) -> () + %i10 = affine.apply #map10(%c0) + %10 = vector.transfer_read %buffi16[%i10], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%10) : (vector<32xi16>) -> () + + return %c0_i32 : i32 +} + +// 
CHECK-LABEL: vector<32xi16>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 ] +// CHECK-LABEL: vector<32xi16>[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 ] +// CHECK-LABEL: vector<32xi16>[ 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 ] +// CHECK-LABEL: vector<32xi16>[ 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 ] +// CHECK-LABEL: vector<32xi16>[ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ] +// CHECK-LABEL: vector<32xi16>[ 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 ] +// CHECK-LABEL: vector<32xi16>[ 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 ] +// CHECK-LABEL: vector<32xi16>[ 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 ] +// CHECK-LABEL: vector<32xi16>[ 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 ] +// CHECK-LABEL: vector<32xi16>[ 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 ] +// CHECK-LABEL: vector<32xi16>[ 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 ] +// CHECK-LABEL: SUCCESS diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/main.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/main.cc new file mode 100644 index 0000000000..a81ee4f46c --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/main.cc @@ -0,0 +1,11 @@ +#include + +int entry(void); + +int main(void) { + int r = entry(); + if (r) + printf("ERROR: %d", r); + printf("SUCCESS"); + return r; +} diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/helplib.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/helplib.cc new file mode 100644 index 0000000000..c1347a4848 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/helplib.cc @@ -0,0 +1,48 @@ +#include "aie_api/aie.hpp" +#include "aie_api/utils.hpp" +#include +#include + +template const char *tid() { return "@"; } + +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } + +template void printv(vtype v) { + printf("vector<%dx%s%u>[ ", nlanes, tid(), 8 * sizeof(elemtype)); + aie::print(aie::vector(v)); + printf("]\n"); +} + +void printv16xi32(v16int32 v) { printv<16, int32_t>(v); } + +void printv8xi32(v8int32 v) { printv<8, int32_t>(v); } + +void printv32xi16(v32int16 v) { printv<32, int16_t>(v); } + +void printv16xi16(v16int16 v) { printv<16, int16_t>(v); } + +void printv32xi8(v32int8 v) { printv<32, int8_t>(v); } + +alignas(32) int32_t buff_i32[64]; +alignas(32) int16_t buff_i16[64]; +alignas(32) int8_t buff_i8[64]; + +int32_t *loadA64xi32() { + for (int i = 0; i < 64; ++i) + buff_i32[i] = i; + return buff_i32; +} + +int16_t *loadA64xi16() { + for (int i = 0; i < 64; ++i) + buff_i16[i] = i; + return buff_i16; +} + +int8_t *loadA64xi8() { + for (int i = 0; i < 64; ++i) + buff_i8[i] = i; + return buff_i8; +} diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/kernel.mlir b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/kernel.mlir new file mode 100644 index 0000000000..a3ea4b8cc8 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/kernel.mlir @@ -0,0 +1,35 @@ 
+// REQUIRES: valid_xchess_license +// RUN: aie-opt %s -convert-vector-to-aievec="aie-target=aieml" | aie-translate -aieml=true -aievec-to-cpp -o kernel.tmp.cc +// RUN: echo "#include " > kernel.cc && cat kernel.tmp.cc >> kernel.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ -D__AIEARCH__=20 kernel.cc %S/helplib.cc %S/main.cc +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s + +func.func private @printv16xi32(%v : vector<16xi32>) +func.func private @loadA64xi32() -> memref<64xi32> + +#map6 = affine_map<(d0) -> (d0 + 6)> + +func.func @entry() -> i32 { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c5 = arith.constant 5 : index + + %buffi32 = func.call @loadA64xi32() : () -> (memref<64xi32>) + + %v0 = vector.transfer_read %buffi32[%c0], %c0_i32 : memref<64xi32>, vector<16xi32> + func.call @printv16xi32(%v0) : (vector<16xi32>) -> () + + %v5 = vector.transfer_read %buffi32[%c5], %c0_i32 : memref<64xi32>, vector<16xi32> + func.call @printv16xi32(%v5) : (vector<16xi32>) -> () + + %idx6 = affine.apply #map6(%c0) + %v6 = vector.transfer_read %buffi32[%idx6], %c0_i32 : memref<64xi32>, vector<16xi32> + func.call @printv16xi32(%v6) : (vector<16xi32>) -> () + + return %c0_i32 : i32 +} + +// CHECK-LABEL: vector<16xi32>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ] +// CHECK-LABEL: vector<16xi32>[ 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ] +// CHECK-LABEL: vector<16xi32>[ 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 ] +// CHECK-LABEL: SUCCESS diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/main.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/main.cc new file mode 100644 index 0000000000..a81ee4f46c --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/main.cc @@ -0,0 +1,11 @@ +#include + +int entry(void); + +int main(void) { + int r = entry(); + if (r) + printf("ERROR: %d", r); + printf("SUCCESS"); + return r; +} diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/helplib.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/helplib.cc new file mode 100644 index 0000000000..4769c84910 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/helplib.cc @@ -0,0 +1,50 @@ +#include "aie_api/aie.hpp" +#include "aie_api/utils.hpp" +#include +#include + +template const char *tid() { return "@"; } + +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } + +template void printv(vtype v) { + printf("vector<%dx%s%u>[ ", nlanes, tid(), 8 * sizeof(elemtype)); + aie::print(aie::vector(v)); + printf("]\n"); +} + +void printv16xi32(v16int32 v) { printv<16, int32_t>(v); } + +void printv8xi32(v8int32 v) { printv<8, int32_t>(v); } + +void printv32xi16(v32int16 v) { printv<32, int16_t>(v); } + +void printv16xi16(v16int16 v) { printv<16, int16_t>(v); } + +void printv32xi8(v32int8 v) { printv<32, int8_t>(v); } + +void printv64xi8(v64int8 v) { printv<64, int8_t>(v); } + +alignas(32) int32_t buff_i32[64]; +alignas(32) int16_t buff_i16[64]; +alignas(32) int8_t buff_i8[128]; + +int32_t *loadA64xi32() { + for (int i = 0; i < 64; ++i) + buff_i32[i] = i; + return buff_i32; +} + +int16_t *loadA64xi16() { + for (int i = 0; i < 64; ++i) + buff_i16[i] = i; + return buff_i16; +} + +int8_t *loadA128xi8() { + for (int i = 0; i < 128; ++i) + buff_i8[i] = i; + return buff_i8; +} diff --git 
a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/kernel.mlir b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/kernel.mlir new file mode 100644 index 0000000000..f7f445e360 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/kernel.mlir @@ -0,0 +1,81 @@ +// REQUIRES: valid_xchess_license +// RUN: aie-opt %s -convert-vector-to-aievec="aie-target=aieml" | aie-translate -aieml=true -aievec-to-cpp -o kernel.tmp.cc +// RUN: echo "#include " > kernel.cc && cat kernel.tmp.cc >> kernel.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ -D__AIEARCH__=20 kernel.cc %S/helplib.cc %S/main.cc +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s + +func.func private @printv64xi8(%v : vector<64xi8>) +func.func private @loadA128xi8() -> memref<128xi8> + +#map6 = affine_map<(d0) -> (d0 + 6)> +#map7 = affine_map<(d0) -> (d0 + 7)> +#map8 = affine_map<(d0) -> (d0 + 8)> +#map9 = affine_map<(d0) -> (d0 + 9)> +#map10 = affine_map<(d0) -> (d0 + 10)> + +func.func @entry() -> i32 { + %c0_i32 = arith.constant 0 : i32 + %c0_i8 = arith.constant 0 : i8 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %c6 = arith.constant 6 : index + %c7 = arith.constant 7 : index + %c8 = arith.constant 8 : index + %c9 = arith.constant 9 : index + %c10 = arith.constant 10 : index + %c11 = arith.constant 11 : index + %c12 = arith.constant 12 : index + %c13 = arith.constant 13 : index + %c14 = arith.constant 14 : index + %c15 = arith.constant 15 : index + + %buffi8 = func.call @loadA128xi8() : () -> (memref<128xi8>) + %v16 = vector.transfer_read %buffi8[%c0], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%v16) : (vector<64xi8>) -> () + + %1 = vector.transfer_read %buffi8[%c1], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%1) : (vector<64xi8>) -> () + %2 = vector.transfer_read %buffi8[%c2], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%2) : (vector<64xi8>) -> () + %3 = vector.transfer_read %buffi8[%c3], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%3) : (vector<64xi8>) -> () + %4 = vector.transfer_read %buffi8[%c4], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%4) : (vector<64xi8>) -> () + %5 = vector.transfer_read %buffi8[%c5], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%5) : (vector<64xi8>) -> () + + %i6 = affine.apply #map6(%c0) + %6 = vector.transfer_read %buffi8[%i6], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%6) : (vector<64xi8>) -> () + %i7 = affine.apply #map7(%c0) + %7 = vector.transfer_read %buffi8[%i7], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%7) : (vector<64xi8>) -> () + %i8 = affine.apply #map8(%c0) + %8 = vector.transfer_read %buffi8[%i8], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%8) : (vector<64xi8>) -> () + %i9 = affine.apply #map9(%c0) + %9 = vector.transfer_read %buffi8[%i9], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%9) : (vector<64xi8>) -> () + %i10 = affine.apply #map10(%c0) + %10 = vector.transfer_read %buffi8[%i10], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%10) : (vector<64xi8>) -> () + + return %c0_i32 : i32 +} + +// CHECK-LABEL: vector<64xi8>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 
+// CHECK-LABEL: vector<64xi8>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 ]
+// CHECK-LABEL: vector<64xi8>[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 ]
+// CHECK-LABEL: vector<64xi8>[ 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 ]
+// CHECK-LABEL: vector<64xi8>[ 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 ]
+// CHECK-LABEL: vector<64xi8>[ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 ]
+// CHECK-LABEL: vector<64xi8>[ 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 ]
+// CHECK-LABEL: vector<64xi8>[ 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 ]
+// CHECK-LABEL: vector<64xi8>[ 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 ]
+// CHECK-LABEL: vector<64xi8>[ 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 ]
+// CHECK-LABEL: vector<64xi8>[ 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 ]
+// CHECK-LABEL: vector<64xi8>[ 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 ]
+// CHECK-LABEL: SUCCESS
diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/main.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/main.cc
new file mode 100644
index 0000000000..a81ee4f46c
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/main.cc
@@ -0,0 +1,12 @@
+#include <stdio.h>
+
+int entry(void);
+
+int main(void) {
+  int r = entry();
+  if (r)
+    printf("ERROR: %d", r);
+  else
+    printf("SUCCESS");
+  return r;
+}
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/helplib.cc b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/helplib.cc
new file mode 100644
index 0000000000..c1347a4848
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/helplib.cc
@@ -0,0 +1,51 @@
+#include "aie_api/aie.hpp"
+#include "aie_api/utils.hpp"
+#include <cstdint>
+#include <cstdio>
+
+template <typename elemtype> const char *tid() { return "@"; }
+
+template <> const char *tid<int8_t>() { return "i"; }
+template <> const char *tid<int16_t>() { return "i"; }
+template <> const char *tid<int32_t>() { return "i"; }
+
+template <int nlanes, typename elemtype, typename vtype>
+void printv(vtype v) {
+  printf("vector<%dx%s%u>[ ", nlanes, tid<elemtype>(), (unsigned)(8 * sizeof(elemtype)));
+  aie::print(aie::vector<elemtype, nlanes>(v));
+  printf("]\n");
+}
+
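+// Non-templated wrappers with C-callable names; kernel.mlir declares these
+// via func.func private and calls them to dump vector contents.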
+void printv16xi32(v16int32 v) { printv<16, int32_t>(v); }
+
+void printv8xi32(v8int32 v) { printv<8, int32_t>(v); }
+
+void printv32xi16(v32int16 v) { printv<32, int16_t>(v); }
+
+void printv16xi16(v16int16 v) { printv<16, int16_t>(v); }
+
+void printv32xi8(v32int8 v) { printv<32, int8_t>(v); }
+
+alignas(32) int32_t buff_i32[64];
+alignas(32) int16_t buff_i16[64];
+alignas(32) int8_t buff_i8[64];
+
+int32_t *loadA64xi32() {
+  for (int i = 0; i < 64; ++i)
+    buff_i32[i] = i;
+  return buff_i32;
+}
+
+int16_t *loadA64xi16() {
+  for (int i = 0; i < 64; ++i)
+    buff_i16[i] = i;
+  return buff_i16;
+}
+
+int8_t *loadA64xi8() {
+  for (int i = 0; i < 64; ++i)
+    buff_i8[i] = i;
+  return buff_i8;
+}
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/kernel.mlir b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/kernel.mlir
new file mode 100644
index 0000000000..6cd644028c
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/kernel.mlir
@@ -0,0 +1,84 @@
+// REQUIRES: valid_xchess_license
+// RUN: aie-opt %s -convert-vector-to-aievec | aie-translate -aievec-to-cpp -o kernel.tmp.cc
+// RUN: echo "#include <stdint.h>" > kernel.cc && cat kernel.tmp.cc >> kernel.cc
+// RUN: xchesscc_wrapper aie -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ kernel.cc %S/helplib.cc %S/main.cc
+// RUN: xca_udm_dbg -qf -T -P %aietools/data/versal_prod/lib -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s
+
+func.func private @printv16xi16(%v : vector<16xi16>)
+func.func private @loadA64xi16() -> memref<64xi16>
+
+#map6 = affine_map<(d0) -> (d0 + 6)>
+#map7 = affine_map<(d0) -> (d0 + 7)>
+#map8 = affine_map<(d0) -> (d0 + 8)>
+#map9 = affine_map<(d0) -> (d0 + 9)>
+#map10 = affine_map<(d0) -> (d0 + 10)>
+
+func.func @entry() -> i32 {
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i16 = arith.constant 0 : i16
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %c4 = arith.constant 4 : index
+  %c5 = arith.constant 5 : index
+  %c6 = arith.constant 6 : index
+  %c7 = arith.constant 7 : index
+  %c8 = arith.constant 8 : index
+  %c9 = arith.constant 9 : index
+  %c10 = arith.constant 10 : index
+  %c11 = arith.constant 11 : index
+  %c12 = arith.constant 12 : index
+  %c13 = arith.constant 13 : index
+  %c14 = arith.constant 14 : index
+  %c15 = arith.constant 15 : index
+
+  %buffi16 = func.call @loadA64xi16() : () -> (memref<64xi16>)
+  %v16 = vector.transfer_read %buffi16[%c0], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%v16) : (vector<16xi16>) -> ()
+
+  %1 = vector.transfer_read %buffi16[%c1], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%1) : (vector<16xi16>) -> ()
+  %2 = vector.transfer_read %buffi16[%c2], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%2) : (vector<16xi16>) -> ()
+  %3 = vector.transfer_read %buffi16[%c3], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%3) : (vector<16xi16>) -> ()
+  %4 = vector.transfer_read %buffi16[%c4], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%4) : (vector<16xi16>) -> ()
+  %5 = vector.transfer_read %buffi16[%c5], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%5) : (vector<16xi16>) -> ()
+
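+  // Offsets 6-10 are computed with affine.apply so the index reaches the
+  // vector-to-aievec lowering as an affine expression rather than a plain
+  // constant, covering both forms an unaligned read can take.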
+  %i6 = affine.apply #map6(%c0)
+  %6 = vector.transfer_read %buffi16[%i6], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%6) : (vector<16xi16>) -> ()
+  %i7 = affine.apply #map7(%c0)
+  %7 = vector.transfer_read %buffi16[%i7], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%7) : (vector<16xi16>) -> ()
+  %i8 = affine.apply #map8(%c0)
+  %8 = vector.transfer_read %buffi16[%i8], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%8) : (vector<16xi16>) -> ()
+  %i9 = affine.apply #map9(%c0)
+  %9 = vector.transfer_read %buffi16[%i9], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%9) : (vector<16xi16>) -> ()
+  %i10 = affine.apply #map10(%c0)
+  %10 = vector.transfer_read %buffi16[%i10], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%10) : (vector<16xi16>) -> ()
+
+  return %c0_i32 : i32
+}
+
+// CHECK-LABEL: vector<16xi16>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ]
+// CHECK-LABEL: vector<16xi16>[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ]
+// CHECK-LABEL: vector<16xi16>[ 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ]
+// CHECK-LABEL: vector<16xi16>[ 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ]
+// CHECK-LABEL: vector<16xi16>[ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ]
+// CHECK-LABEL: vector<16xi16>[ 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ]
+// CHECK-LABEL: vector<16xi16>[ 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 ]
+// CHECK-LABEL: vector<16xi16>[ 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 ]
+// CHECK-LABEL: vector<16xi16>[ 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ]
+// CHECK-LABEL: vector<16xi16>[ 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
+// CHECK-LABEL: vector<16xi16>[ 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 ]
+// CHECK-LABEL: SUCCESS
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/main.cc b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/main.cc
new file mode 100644
index 0000000000..a81ee4f46c
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/main.cc
@@ -0,0 +1,12 @@
+#include <stdio.h>
+
+int entry(void);
+
+int main(void) {
+  int r = entry();
+  if (r)
+    printf("ERROR: %d", r);
+  else
+    printf("SUCCESS");
+  return r;
+}
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/helplib.cc b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/helplib.cc
new file mode 100644
index 0000000000..c1347a4848
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/helplib.cc
@@ -0,0 +1,51 @@
+#include "aie_api/aie.hpp"
+#include "aie_api/utils.hpp"
+#include <cstdint>
+#include <cstdio>
+
+template <typename elemtype> const char *tid() { return "@"; }
+
+template <> const char *tid<int8_t>() { return "i"; }
+template <> const char *tid<int16_t>() { return "i"; }
+template <> const char *tid<int32_t>() { return "i"; }
+
+template <int nlanes, typename elemtype, typename vtype>
+void printv(vtype v) {
+  printf("vector<%dx%s%u>[ ", nlanes, tid<elemtype>(), (unsigned)(8 * sizeof(elemtype)));
+  aie::print(aie::vector<elemtype, nlanes>(v));
+  printf("]\n");
+}
+
+// Non-templated wrappers with C-callable names; kernel.mlir declares these
+// via func.func private and calls them to dump vector contents.
+void printv16xi32(v16int32 v) { printv<16, int32_t>(v); }
+
+void printv8xi32(v8int32 v) { printv<8, int32_t>(v); }
+
+void printv32xi16(v32int16 v) { printv<32, int16_t>(v); }
+
+void printv16xi16(v16int16 v) { printv<16, int16_t>(v); }
+
+void printv32xi8(v32int8 v) { printv<32, int8_t>(v); }
+
+alignas(32) int32_t buff_i32[64];
+alignas(32) int16_t buff_i16[64];
+alignas(32) int8_t buff_i8[64];
+
+int32_t *loadA64xi32() {
+  for (int i = 0; i < 64; ++i)
+    buff_i32[i] = i;
+  return buff_i32;
+}
+
+int16_t *loadA64xi16() {
+  for (int i = 0; i < 64; ++i)
+    buff_i16[i] = i;
+  return buff_i16;
+}
+
+int8_t *loadA64xi8() {
+  for (int i = 0; i < 64; ++i)
+    buff_i8[i] = i;
+  return buff_i8;
+}
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/kernel.mlir b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/kernel.mlir
new file mode 100644
index 0000000000..670e5c73f1
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/kernel.mlir
@@ -0,0 +1,38 @@
+// REQUIRES: valid_xchess_license
+// RUN: aie-opt %s -convert-vector-to-aievec | aie-translate -aievec-to-cpp -o kernel.tmp.cc
+// RUN: echo "#include <stdint.h>" > kernel.cc && cat kernel.tmp.cc >> kernel.cc
+// RUN: xchesscc_wrapper aie -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ kernel.cc %S/helplib.cc %S/main.cc
+// RUN: xca_udm_dbg -qf -T -P %aietools/data/versal_prod/lib -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s
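+// Reads a vector<8xi32> at an aligned offset (0) and at unaligned offsets
+// given as a constant (5) and as an affine expression (6), checking the
+// contents against the identity data from loadA64xi32().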
+
+func.func private @printv8xi32(%v : vector<8xi32>)
+func.func private @loadA64xi32() -> memref<64xi32>
+
+#map6 = affine_map<(d0) -> (d0 + 6)>
+
+func.func @entry() -> i32 {
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c5 = arith.constant 5 : index
+
+  %buffi32 = func.call @loadA64xi32() : () -> (memref<64xi32>)
+
+  %v0 = vector.transfer_read %buffi32[%c0], %c0_i32 : memref<64xi32>, vector<8xi32>
+  func.call @printv8xi32(%v0) : (vector<8xi32>) -> ()
+
+  %v5 = vector.transfer_read %buffi32[%c5], %c0_i32 : memref<64xi32>, vector<8xi32>
+  func.call @printv8xi32(%v5) : (vector<8xi32>) -> ()
+
+  %idx6 = affine.apply #map6(%c0)
+  %v6 = vector.transfer_read %buffi32[%idx6], %c0_i32 : memref<64xi32>, vector<8xi32>
+  func.call @printv8xi32(%v6) : (vector<8xi32>) -> ()
+
+  return %c0_i32 : i32
+}
+
+// CHECK-LABEL: vector<8xi32>[ 0 1 2 3 4 5 6 7 ]
+// CHECK-LABEL: vector<8xi32>[ 5 6 7 8 9 10 11 12 ]
+// CHECK-LABEL: vector<8xi32>[ 6 7 8 9 10 11 12 13 ]
+// CHECK-LABEL: SUCCESS
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/main.cc b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/main.cc
new file mode 100644
index 0000000000..a81ee4f46c
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/main.cc
@@ -0,0 +1,12 @@
+#include <stdio.h>
+
+int entry(void);
+
+int main(void) {
+  int r = entry();
+  if (r)
+    printf("ERROR: %d", r);
+  else
+    printf("SUCCESS");
+  return r;
+}
diff --git a/tools/aie-opt/aie-opt.cpp b/tools/aie-opt/aie-opt.cpp
index 480f5d7f8a..c20d2d8ba9 100644
--- a/tools/aie-opt/aie-opt.cpp
+++ b/tools/aie-opt/aie-opt.cpp
@@ -25,6 +25,7 @@
 #include "aie/Dialect/ADF/ADFDialect.h"
 #include "aie/Dialect/AIE/IR/AIEDialect.h"
 #include "aie/Dialect/AIE/Transforms/AIEPasses.h"
+#include "aie/Dialect/AIEVec/Analysis/Passes.h"
 #include "aie/Dialect/AIEVec/IR/AIEVecDialect.h"
 #include "aie/Dialect/AIEVec/Pipelines/Passes.h"
 #include "aie/Dialect/AIEVec/Transforms/Passes.h"
@@ -40,6 +41,7 @@
 int main(int argc, char **argv) {
   xilinx::registerConversionPasses();
   aie::registerAIEPasses();
   xilinx::AIEX::registerAIEXPasses();
+  xilinx::aievec::registerAIEVecAnalysisPasses();
   xilinx::aievec::registerAIEVecPasses();
   xilinx::aievec::registerAIEVecPipelines();