diff --git a/include/aie-c/TargetModel.h b/include/aie-c/TargetModel.h index c2c026fd37..59da2f2bf9 100644 --- a/include/aie-c/TargetModel.h +++ b/include/aie-c/TargetModel.h @@ -42,6 +42,10 @@ DEFINE_C_API_STRUCT(AieTargetModel, uint64_t); MLIR_CAPI_EXPORTED AieTargetModel aieGetTargetModel(uint32_t device); +/// Returns the data bus width for the target model. +MLIR_CAPI_EXPORTED uint32_t +aieGetTargetModelAddressGenGranularity(AieTargetModel targetModel); + /// Returns the number of columns in the target model. MLIR_CAPI_EXPORTED int aieTargetModelColumns(AieTargetModel targetModel); diff --git a/include/aie/Dialect/AIE/IR/AIETargetModel.h b/include/aie/Dialect/AIE/IR/AIETargetModel.h index e9f5de7680..a6ec03d230 100644 --- a/include/aie/Dialect/AIE/IR/AIETargetModel.h +++ b/include/aie/Dialect/AIE/IR/AIETargetModel.h @@ -61,6 +61,9 @@ class AIETargetModel { /// Return the target architecture. virtual AIEArch getTargetArch() const = 0; + /// Return the data bus width of the device. + virtual uint32_t getAddressGenGranularity() const = 0; + /// Return the number of columns in the device. virtual int columns() const = 0; @@ -293,6 +296,8 @@ class AIE2TargetModel : public AIETargetModel { AIEArch getTargetArch() const override; + uint32_t getAddressGenGranularity() const override { return 32; } + std::optional getMemWest(TileID src) const override; std::optional getMemEast(TileID src) const override; std::optional getMemNorth(TileID src) const override; @@ -352,6 +357,8 @@ class VC1902TargetModel : public AIE1TargetModel { public: VC1902TargetModel() = default; + uint32_t getAddressGenGranularity() const override { return 32; } + int columns() const override { return 50; } int rows() const override { return 9; /* One Shim row and 8 Core rows. 
*/ } @@ -532,6 +539,8 @@ class VirtualizedNPUTargetModel : public BaseNPUTargetModel { public: VirtualizedNPUTargetModel(int _cols) : cols(_cols) {} + uint32_t getAddressGenGranularity() const override { return 32; } + int columns() const override { return cols; } bool isShimNOCTile(int col, int row) const override { return row == 0; } diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index 39ce49ada8..09f950f865 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -567,17 +567,20 @@ def AIE_NpuWriteRTPOp: AIEX_Op<"npu.rtp_write", []> { } // Push BD to Queue -def AIE_NpuShimTilePushQueueOp: AIEX_Op<"npu.shimtile_push_queue", []> { +def AIE_NpuPushQueueOp: AIEX_Op<"npu.push_queue", []> { let summary = "bd queue push operator"; let arguments = ( - ins FlatSymbolRefAttr:$metadata, + ins I32Attr:$column, + I32Attr:$row, + DMAChannelDir:$direction, + I32Attr:$channel, BoolAttr:$issue_token, I32Attr:$repeat_count, I32Attr:$bd_id ); let results = (outs ); let assemblyFormat = [{ - attr-dict + `(` $column `,` $row `,` $direction `:` $channel `)` attr-dict }]; let hasVerifier = 1; let description = [{ @@ -589,10 +592,10 @@ def AIE_NpuShimTilePushQueueOp: AIEX_Op<"npu.shimtile_push_queue", []> { def AIE_NpuWrite32Op: AIEX_Op<"npu.write32", []> { let summary = "write32 operator"; let arguments = ( - ins I32Attr:$column, - I32Attr:$row, - UI32Attr:$address, - UI32Attr:$value + ins UI32Attr:$address, + UI32Attr:$value, + OptionalAttr:$column, + OptionalAttr:$row ); let results = (outs ); let assemblyFormat = [{ @@ -623,12 +626,28 @@ def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> { }]; } -// WRITEBD_EXTEND_SHIMTILE -def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"npu.writebd_shimtile", []> { +// XAIE_IO_CUSTOM_OP_BEGIN + 1 (address patch) +def AIE_NpuAddressPatchOp: AIEX_Op<"npu.address_patch", []> { + let summary = "address patch operator"; + let arguments = ( + ins UI32Attr:$addr, + I32Attr:$arg_idx, + 
I32Attr:$arg_plus + ); + let results = (outs ); + let assemblyFormat = [{ + attr-dict + }]; + let description = [{ + address patch operator + }]; +} + +// NPU Bd Write operation +def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd", []> { let summary = "dma operator"; let arguments = ( ins I32Attr:$column, - I32Attr:$column_num, I32Attr:$ddr_id, I32Attr:$bd_id, I32Attr:$buffer_length, @@ -646,6 +665,7 @@ def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"npu.writebd_shimtile", []> { I32Attr:$iteration_size, I32Attr:$iteration_stride, I32Attr:$next_bd, + I32Attr:$row, I32Attr:$use_next_bd, I32Attr:$valid_bd, I32Attr:$lock_rel_val, @@ -658,7 +678,7 @@ def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"npu.writebd_shimtile", []> { let assemblyFormat = [{ attr-dict }]; let hasVerifier = 1; let description = [{ - writebd_shimtile operator + writebd operator }]; } diff --git a/lib/CAPI/TargetModel.cpp b/lib/CAPI/TargetModel.cpp index 9c41828871..bd5b33bd6e 100644 --- a/lib/CAPI/TargetModel.cpp +++ b/lib/CAPI/TargetModel.cpp @@ -28,6 +28,10 @@ AieTargetModel aieGetTargetModel(uint32_t device) { xilinx::AIE::getTargetModel(static_cast(device))); } +uint32_t aieGetTargetModelAddressGenGranularity(AieTargetModel targetModel) { + return unwrap(targetModel).getAddressGenGranularity(); +} + int aieTargetModelColumns(AieTargetModel targetModel) { return unwrap(targetModel).columns(); } diff --git a/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp b/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp index 059c2101a4..6a0cb062da 100644 --- a/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp +++ b/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp @@ -1584,9 +1584,10 @@ class MaxOpConversion : public mlir::ConvertOpToLLVMPattern { // create xllvm intrinsic Value maxOp = nullptr; if (llvm::isa(resultScaTy)) { - // create constant for cmp + // create constant for third operand `cmp` + // Note: `cmp` is implicitly treated as `sign` to the vmax intrinsic auto cmpCst = rewriter.create( - loc, rewriter.getI32Type(), 
rewriter.getI32IntegerAttr(0)); + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(1)); SmallVector operands{adaptor.getLhs(), adaptor.getRhs(), cmpCst}; if (resultBitWidth == 8) { maxOp = rewriter.create( @@ -1681,9 +1682,10 @@ class MinOpConversion : public mlir::ConvertOpToLLVMPattern { // create xllvm intrinsic Value minOp = nullptr; if (llvm::isa(resultScaTy)) { - // create constant for cmp + // create constant for third operand `cmp` + // Note: `cmp` is implicitly treated as `sign` to the vmin intrinsic auto cmpCst = rewriter.create( - loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(1)); SmallVector operands{adaptor.getLhs(), adaptor.getRhs(), cmpCst}; if (resultBitWidth == 8) { minOp = rewriter.create( diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 00a9a89137..c9bb260637 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Utils/Utils.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Iterators.h" @@ -782,147 +783,12 @@ struct AIEObjectFifoStatefulTransformPass return lcm; } - // Recursively calls itself if it finds a nested for loop. - // Returns the next index to use to uniquely identify operations - // on the body of the innerLoop. 
- int identifyDependencies(scf::ForOp outerLoop, scf::ForOp innerLoop, - std::vector &operations, - DenseMap &opIndex, - std::vector> &dependencies, - int startIndex) { - Block *body = innerLoop.getBody(); - auto withoutTerminator = --body->end(); - int index = startIndex; - for (auto op = body->begin(); op != withoutTerminator; ++op) { - operations.push_back(&*op); - opIndex[&*op] = index; - - // identify dependencies - auto numOperands = op->getNumOperands(); - std::vector dependecyIndices; - for (int i = 0; static_cast(i) < numOperands; i++) { - auto operand = op->getOperand(i); - int dependencyIndex = -1; - - if (operand == outerLoop.getInductionVar()) { - dependencyIndex = LOOP_VAR_DEPENDENCY; - } else { - if (auto definingOp = operand.getDefiningOp(); - opIndex.find(definingOp) != opIndex.end()) - dependencyIndex = opIndex[definingOp]; - } - dependecyIndices.push_back(dependencyIndex); - } - dependencies.push_back(dependecyIndices); - - index++; - - // if op was a nested for-loop, also keep track of dependencies inside it - if (auto nestedLoop = dyn_cast(op)) - index = identifyDependencies(outerLoop, nestedLoop, operations, opIndex, - dependencies, index); - } - return index; - } - - // Replace operands of cloned operation with results from other - // duplicated operations based on the index of the original - // operation and its dependencies. 
- void replaceOperands(OpBuilder &builder, Operation *clone, - int originalOpIndex, Value base, int64_t step, - bool inLoop, int currentDuplication, - std::vector> &dependencies, - std::vector &duplicatedOperations) { - auto numOperands = clone->getNumOperands(); - for (int operandIndex = 0; - static_cast(operandIndex) < numOperands; operandIndex++) { - - if (int originalDependencyIndex = - dependencies[originalOpIndex][operandIndex]; - originalDependencyIndex >= 0) { - // replace the operand with the result of operation with - // same index in current duplication - auto duplicatedOp = duplicatedOperations[originalDependencyIndex]; - Value result = duplicatedOp->getResult(0); - clone->setOperand(operandIndex, result); - - } else if (originalDependencyIndex == LOOP_VAR_DEPENDENCY) { - int64_t increment_value; - if (inLoop) - // +1 because we do not duplicate original loop body - increment_value = (currentDuplication + 1) * step; - else - increment_value = currentDuplication * step; - - auto increment = builder.create( - builder.getUnknownLoc(), builder.getIndexAttr(increment_value)); - auto sum = builder.create(builder.getUnknownLoc(), - builder.getIndexType(), base, - increment->getResult(0)); - clone->setOperand(operandIndex, sum->getResult(0)); - } - } - duplicatedOperations.push_back(clone); - } - - // Function that duplicates given operations for the given number - // of times. !!! Assumes builder insertion point is set. !!! - // If there is a dependency on a loop induction variable, the given - // base mlir::Value is used to resolve it. - void duplicateBlock(OpBuilder &builder, int numDuplications, - std::vector &operations, - std::vector> &dependencies, Value base, - int64_t step, bool inLoop) { - std::vector duplicatedOperations; // operations in current - // Recursive function to replace operands, uses recursion to handle nested - // loop structures. 
- std::function replaceOpsNested = - [&](Operation *op, unsigned &opIndex, - unsigned numDuplications) -> void { - if (auto loopOp = dyn_cast(op)) { - Block *body = loopOp.getBody(); - auto withoutTerminator = --body->end(); - // NOTE(jornt): This only handles the cases where the nested scf::for is - // located at the start of the body. This should be the most common - // case, but is not fully generic. - if (auto nestedLoop = dyn_cast(body->begin())) { - opIndex++; - replaceOperands(builder, nestedLoop, opIndex, base, step, inLoop, - numDuplications, dependencies, duplicatedOperations); - replaceOpsNested(nestedLoop, opIndex, numDuplications); - } else { - for (auto loopBodyOp = body->begin(); loopBodyOp != withoutTerminator; - ++loopBodyOp) { - opIndex++; - replaceOperands(builder, &*loopBodyOp, opIndex, base, step, inLoop, - numDuplications, dependencies, - duplicatedOperations); - } - } - } - }; - - // duplication iteration - for (int i = 0; i < numDuplications; i++) { - duplicatedOperations.clear(); - for (unsigned opIndex = 0; opIndex < operations.size(); opIndex++) { - // for each operand, check whether there was a dependecy - auto op = operations[opIndex]; - auto clone = op->clone(); - replaceOperands(builder, clone, opIndex, base, step, inLoop, i, - dependencies, duplicatedOperations); - builder.insert(clone); - replaceOpsNested(clone, opIndex, i); - } - } - } - // Function that unrolls for-loops that contain objectFifo operations. 
- void unrollForLoops(DeviceOp &device, OpBuilder &builder, - std::set objectFifoTiles) { + LogicalResult unrollForLoops(DeviceOp &device, OpBuilder &builder, + std::set objectFifoTiles) { for (auto coreOp : device.getOps()) { if (objectFifoTiles.count(coreOp.getTileOp()) > 0) { - coreOp.walk([&](scf::ForOp forLoop) { + WalkResult res = coreOp.walk([&](scf::ForOp forLoop) { // look for operations on objectFifos // when multiple fifos in same loop, must use the smallest // common multiplier as the unroll factor @@ -942,91 +808,20 @@ struct AIEObjectFifoStatefulTransformPass computeLCM(objFifoSizes); // also counts original loop body if (found) { - std::vector - operations; // operations in original loop body, without - // terminator operation - DenseMap - opIndex; // maps operations of original loop body to their - // position in it - std::vector> - dependencies; // index in first vecotr corresponds to position - // in original loop body dependency vector has - // size equal to number of operands of that - // operation: - // * if LOOP_VAR_DEPENDENCY : operand is - // dependent on loop induction variable - // * if -1 : operand is not dependent on any - // operation in loop body - // * if >=0: operand is dependent on operation - // with that index in original loop body - - // find new loop size and step - auto old_upper_bound = forLoop.getUpperBound() - .getDefiningOp() - .getValue(); - int64_t old_upper_value = - llvm::dyn_cast(old_upper_bound).getInt(); - auto old_lower_bound = forLoop.getLowerBound() - .getDefiningOp() - .getValue(); - int64_t old_lower_value = - llvm::dyn_cast(old_lower_bound).getInt(); - auto old_step = - forLoop.getStep().getDefiningOp().getValue(); - int64_t old_step_value = - llvm::dyn_cast(old_step).getInt(); - int64_t num_iter = - (old_upper_value - old_lower_value) / old_step_value; - - int64_t num_unrolls; // number of times to unroll loop, not counting - // original body - - identifyDependencies(forLoop, forLoop, operations, opIndex, - 
dependencies, 0); - - if (num_iter <= unrollFactor) { - // duplicate loop body and remove loop - num_unrolls = num_iter; - builder.setInsertionPointAfter(forLoop); - duplicateBlock(builder, num_unrolls, operations, dependencies, - forLoop.getLowerBound(), old_step_value, false); - forLoop.getOperation()->erase(); - - } else { - num_unrolls = unrollFactor - 1; // -1 without original loop body - - // create new upper bound and step - int64_t new_step_value = - static_cast(unrollFactor) * old_step_value; - int64_t remainder = (old_upper_value - old_lower_value) % - new_step_value / old_step_value; - builder.setInsertionPoint(forLoop); - if (remainder > 0) { - int64_t new_upper_bound = (old_upper_value - old_lower_value) / - new_step_value * new_step_value; - auto uBound = builder.create( - builder.getUnknownLoc(), - builder.getIndexAttr(new_upper_bound)); - forLoop.setUpperBound(uBound); - } - auto new_step = builder.create( - builder.getUnknownLoc(), - builder.getIndexAttr(new_step_value)); - forLoop.setStep(new_step); - - // duplicate loop body, insert before terminator operation - builder.setInsertionPoint(&body->back()); - duplicateBlock(builder, num_unrolls, operations, dependencies, - forLoop.getInductionVar(), old_step_value, true); - // duplicate remainder operations after loop body - builder.setInsertionPointAfter(forLoop); - duplicateBlock(builder, remainder, operations, dependencies, - forLoop.getUpperBound(), old_step_value, false); + if (failed(mlir::loopUnrollByFactor(forLoop, unrollFactor))) { + forLoop.emitOpError() + << "could not be unrolled with unrollFactor: " << unrollFactor + << "\n"; + return WalkResult::interrupt(); } } + return WalkResult::advance(); }); + if (res.wasInterrupted()) + return failure(); } } + return success(); } /// Function used to create a UseLockOp based on input parameters. 
@@ -1364,7 +1159,9 @@ struct AIEObjectFifoStatefulTransformPass //===------------------------------------------------------------------===// // Unroll for loops //===------------------------------------------------------------------===// - unrollForLoops(device, builder, objectFifoTiles); + if (failed(unrollForLoops(device, builder, objectFifoTiles))) { + signalPassFailure(); + } //===------------------------------------------------------------------===// // Replace ops diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index f2c9ebc433..e1102b4fe3 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -66,8 +66,16 @@ LogicalResult AIEX::BroadcastPacketOp::verify() { LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() { MemRefType buffer = getMemref().getType(); - if (buffer.getElementTypeBitWidth() != 32) - return emitOpError("must be used with memref type with element width 32."); + const auto &targetModel = AIE::getTargetModel(*this); + auto addressGranularity = targetModel.getAddressGenGranularity(); + if (buffer.getElementTypeBitWidth() > addressGranularity) { + return emitOpError("Maximum element bit width allowed is ") + << addressGranularity << "bits. "; + } else if ((buffer.getNumElements() * buffer.getElementTypeBitWidth()) < + addressGranularity) { + return emitOpError("Minimum data transfer size required is ") + << addressGranularity << "bits. 
"; + } if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) { return getConstantIntValue(s).has_value(); })) @@ -114,9 +122,9 @@ LogicalResult AIEX::NpuDmaWaitOp::verify() { return success(); } -LogicalResult AIEX::NpuShimTilePushQueueOp::verify() { +LogicalResult AIEX::NpuPushQueueOp::verify() { const auto &targetModel = AIE::getTargetModel(*this); - auto numBds = targetModel.getNumBDs(0, 0); // assume shim + auto numBds = targetModel.getNumBDs(getColumn(), getRow()); if (getBdId() > numBds) return emitOpError("BD ID exceeds the maximum ID."); if (getRepeatCount() > 255) @@ -124,9 +132,9 @@ LogicalResult AIEX::NpuShimTilePushQueueOp::verify() { return success(); } -LogicalResult AIEX::NpuWriteBdExShimTileOp::verify() { +LogicalResult AIEX::NpuWriteBdOp::verify() { const auto &targetModel = AIE::getTargetModel(*this); - auto numBds = targetModel.getNumBDs(0, 0); // assume shim + auto numBds = targetModel.getNumBDs(getColumn(), getRow()); if (getBdId() > numBds) return emitOpError("BD ID exceeds the maximum ID."); if (getD0Size() > 0x3FF) diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 564d011e36..95f090776c 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -110,73 +110,48 @@ struct RtpToNpuPattern : OpConversionPattern { IntegerAttr row = IntegerAttr::get(i32ty, r); IntegerAttr address = IntegerAttr::get(ui32ty, rtp_buffer_addr); IntegerAttr value = IntegerAttr::get(i32ty, v); - rewriter.create(op->getLoc(), column.getInt(), row.getInt(), - address.getUInt(), value.getInt()); + rewriter.create(op->getLoc(), address.getUInt(), + value.getInt(), column, row); rewriter.eraseOp(op); return success(); } }; -struct PushToNpuPattern : OpConversionPattern { - -private: - ShimDMAllocationGetter &allocGetter; +struct PushToNpuPattern : OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; - PushToNpuPattern(MLIRContext *context, 
ShimDMAllocationGetter &getter, - PatternBenefit benefit = 1) - : OpConversionPattern(context, benefit), allocGetter(getter) {} + PushToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1) + : OpConversionPattern(context, benefit) {} LogicalResult - matchAndRewrite(NpuShimTilePushQueueOp op, OpAdaptor adaptor, + matchAndRewrite(NpuPushQueueOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto *ctx = op->getContext(); - auto i32ty = IntegerType::get(ctx, 32); - auto zero = IntegerAttr::get(i32ty, 0); - auto ui32ty = - IntegerType::get(ctx, 32, IntegerType::SignednessSemantics::Unsigned); - bool send_tct = op.getIssueToken(); - uint32_t channel_num = 0; - - // initialize fields to zero - auto dev = op->getParentOfType(); - if (!dev) - return op->emitOpError("couldn't find parent of type DeviceOp"); - - auto infoOp = allocGetter.get(dev, op.getMetadata()); - if (!infoOp) - return op->emitOpError("couldn't find shim_dma_allocation op."); - - auto channelDir = infoOp->getChannelDir(); - bool isMM2S = channelDir == AIE::DMAChannelDir::MM2S; - channel_num += infoOp->getChannelIndex(); - - IntegerAttr column = IntegerAttr::get(i32ty, infoOp->getCol()); + // the offset of the task queue register in the tile uint32_t queue_offset; - if (isMM2S) + if (op.getDirection() == AIE::DMAChannelDir::MM2S) queue_offset = 0x1D214; else queue_offset = 0x1D204; - if (channel_num == 1) + if (op.getChannel() == 1) queue_offset += 0x8; - IntegerAttr address = IntegerAttr::get(ui32ty, queue_offset); - // value + // the value to write uint32_t bd_id = op.getBdId(); uint32_t repeat_cnt = op.getRepeatCount(); uint32_t cmd = 0; cmd |= bd_id & 0xF; cmd |= (repeat_cnt & 0xFF) << 16; - if (send_tct) + if (op.getIssueToken()) cmd |= 0x80000000; - IntegerAttr value = IntegerAttr::get(ui32ty, cmd); - rewriter.create(op->getLoc(), column.getInt(), zero.getInt(), - address.getUInt(), value.getUInt()); + auto i32ty = IntegerType::get(op->getContext(), 32); + auto 
column = IntegerAttr::get(i32ty, op.getColumn()); + auto row = IntegerAttr::get(i32ty, 0); + rewriter.create(op->getLoc(), queue_offset, cmd, column, row); rewriter.eraseOp(op); return success(); } @@ -216,7 +191,6 @@ struct DmaToNpuPattern : OpConversionPattern { // initialize fields to zero auto column = zero; - auto column_num = zero; auto ddr_id = zero; auto bd_id = zero; auto buffer_length = zero; @@ -234,6 +208,7 @@ struct DmaToNpuPattern : OpConversionPattern { auto iteration_size = zero; auto iteration_stride = zero; auto next_bd = zero; + auto row = zero; auto use_next_bd = zero; auto valid_bd = zero; auto lock_rel_val = zero; @@ -255,12 +230,25 @@ struct DmaToNpuPattern : OpConversionPattern { llvm::reverse(op.getMixedOffsets()), [](OpFoldResult s) { return getConstantIntValue(s).value(); }); + MemRefType buffer = op.getMemref().getType(); + const auto &targetModel = AIE::getTargetModel(op); + auto elemWidth = buffer.getElementTypeBitWidth(); + auto addressGranularity = targetModel.getAddressGenGranularity(); + if (elemWidth < addressGranularity) { + if (!strides.empty()) { + for (int i = 0; i < 3; i++) { + strides[i] = (strides[i] * elemWidth) / addressGranularity; + } + } + if (!sizes.empty()) + sizes[0] = (sizes[0] * elemWidth) / addressGranularity; + if (!offsets.empty()) + offsets[0] = (offsets[0] * elemWidth) / addressGranularity; + } + // column column = IntegerAttr::get(i32ty, col); - // column_num - column_num = IntegerAttr::get(i32ty, 1); - // ddr_id Block &entryBB = op->getParentOfType().getBody().front(); int arg_idx = -1; @@ -364,15 +352,23 @@ struct DmaToNpuPattern : OpConversionPattern { if (!isMM2S) issue_token = BoolAttr::get(ctx, true); - (void)rewriter.create( - op->getLoc(), column, column_num, ddr_id, bd_id, buffer_length, - buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type, - d0_size, d0_stride, d1_size, d1_stride, d2_stride, iteration_current, - iteration_size, iteration_stride, next_bd, use_next_bd, valid_bd, + 
rewriter.create( + op->getLoc(), column, ddr_id, bd_id, buffer_length, buffer_offset, + enable_packet, out_of_order_id, packet_id, packet_type, d0_size, + d0_stride, d1_size, d1_stride, d2_stride, iteration_current, + iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id); - rewriter.create(op->getLoc(), op.getMetadataAttr(), - issue_token, repeat_count, bd_id); + const AIE::AIETargetModel &tm = + op->getParentOfType().getTargetModel(); + + uint32_t addr = + (col << tm.getColumnShift()) | (0x1D004 + op.getId() * 0x20); + rewriter.create(op->getLoc(), addr, arg_idx, offset); + + rewriter.create( + op->getLoc(), column, row, infoOp->getChannelDirAttr(), + infoOp->getChannelIndexAttr(), issue_token, repeat_count, bd_id); rewriter.eraseOp(op); return success(); @@ -406,15 +402,13 @@ struct DmaWaitToNpuPattern : OpConversionPattern { if (!shimDmaAllocOp) { return op->emitError("couldn't find shim_dma_allocation op"); } - AIE::DMAChannelDir channelDir = shimDmaAllocOp->getChannelDir(); - int channel = shimDmaAllocOp->getChannelIndex(); - int direction = (int)(channelDir == AIE::DMAChannelDir::MM2S); - int column = shimDmaAllocOp->getCol(); // Create with `column_num == 1` and `row_num == 1` to check for a single // column and row. Row is always 0 for shim tiles. 
- (void)rewriter.replaceOpWithNewOp(op, column, 0, direction, - channel, 1, 1); + (void)rewriter.replaceOpWithNewOp( + op, shimDmaAllocOp->getCol(), /* row */ 0, + static_cast(shimDmaAllocOp->getChannelDir()), + shimDmaAllocOp->getChannelIndex(), 1, 1); return success(); } }; @@ -433,12 +427,12 @@ struct AIEDmaToNpuPass : AIEDmaToNpuBase { target.addIllegalOp(); target.addIllegalOp(); target.addIllegalOp(); - target.addIllegalOp(); + target.addIllegalOp(); RewritePatternSet patterns(&getContext()); patterns.insert(&getContext(), cachingGetter); patterns.insert(&getContext(), cachingGetter); - patterns.insert(&getContext(), cachingGetter); + patterns.insert(&getContext()); patterns.insert(&getContext()); if (failed(applyPartialConversion(device, target, std::move(patterns)))) diff --git a/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp b/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp index b7aa242134..7e3b729a46 100644 --- a/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp @@ -49,13 +49,11 @@ struct AIEXToStandardPass : AIEXToStandardBase { RewritePatternSet removepatterns(&getContext()); removepatterns.add>(m.getContext(), m); removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), - m); + removepatterns.add>(m.getContext(), m); removepatterns.add>(m.getContext(), m); removepatterns.add>(m.getContext(), m); removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), - m); + removepatterns.add>(m.getContext(), m); if (failed(applyPartialConversion(m, target, std::move(removepatterns)))) signalPassFailure(); diff --git a/lib/Targets/AIETargetNPU.cpp b/lib/Targets/AIETargetNPU.cpp index 7f17c3ad83..d0b83d1212 100644 --- a/lib/Targets/AIETargetNPU.cpp +++ b/lib/Targets/AIETargetNPU.cpp @@ -27,14 +27,12 @@ using namespace xilinx; using namespace xilinx::AIE; using namespace xilinx::AIEX; -namespace { +#define TXN_OPC_WRITE 0x0 +#define TXN_OPC_BLOCKWRITE 0x1 +#define TXN_OPC_TCT 
0x80 +#define TXN_OPC_DDR_PATCH 0x81 -std::vector getProlog() { - return {0x00000011, 0x01000405, 0x01000100, 0x0B590100, 0x000055FF, - 0x00000001, 0x00000010, 0x314E5A5F, 0x635F5F31, 0x676E696C, - 0x39354E5F, 0x6E693131, 0x5F727473, 0x64726F77, 0x00004573, - 0x07BD9630, 0x000055FF}; -} +namespace { // Example: // - instructions = {3,4,5} @@ -52,104 +50,165 @@ reserveAndGetTail(std::vector &instructions, uint64_t tailSize) { void appendSync(std::vector &instructions, NpuSyncOp op) { - auto words = reserveAndGetTail(instructions, 2); + auto words = reserveAndGetTail(instructions, 4); - uint32_t opCode = 3; - words[0] |= (opCode & 0xff) << 24; - words[0] |= (op.getColumn() & 0xff) << 16; - words[0] |= (op.getRow() & 0xff) << 8; - words[0] |= op.getDirection() & 0x1; + // XAIE_IO_CUSTOM_OP_TCT + words[0] = TXN_OPC_TCT; - words[1] |= (op.getChannel() & 0xff) << 24; - words[1] |= (op.getColumnNum() & 0xff) << 16; - words[1] |= (op.getRowNum() & 0xff) << 8; + words[1] = words.size() * sizeof(uint32_t); // Operation Size + + words[2] |= static_cast(op.getDirection()) & 0xff; + words[2] |= (op.getRow() & 0xff) << 8; + words[2] |= (op.getColumn() & 0xff) << 16; + + words[3] |= (op.getRowNum() & 0xff) << 8; + words[3] |= (op.getColumnNum() & 0xff) << 16; + words[3] |= (op.getChannel() & 0xff) << 24; } void appendWrite32(std::vector &instructions, NpuWrite32Op op) { - auto words = reserveAndGetTail(instructions, 3); + auto words = reserveAndGetTail(instructions, 6); + const AIETargetModel &tm = op->getParentOfType().getTargetModel(); + + // XAIE_IO_WRITE + words[0] = TXN_OPC_WRITE; + words[1] = 0; + words[2] = op.getAddress(); + auto col = op.getColumn(); + auto row = op.getRow(); + if (col && row) + words[2] = ((*col & 0xff) << tm.getColumnShift()) | + ((*row & 0xff) << tm.getRowShift()) | (words[2] & 0xFFFFF); + words[3] = 0; + words[4] = op.getValue(); // Value + words[5] = words.size() * sizeof(uint32_t); // Operation Size +} + +void appendAddressPatch(std::vector 
&instructions, + NpuAddressPatchOp op) { + + auto words = reserveAndGetTail(instructions, 12); + + // XAIE_IO_CUSTOM_OP_DDR_PATCH + words[0] = TXN_OPC_DDR_PATCH; + words[1] = words.size() * sizeof(uint32_t); // Operation Size - uint32_t opCode = 2; - words[0] |= (opCode & 0xff) << 24; - words[0] |= (op.getColumn() & 0xff) << 16; - words[0] |= (op.getRow() & 0xff) << 8; + words[6] = op.getAddr(); + words[7] = 0; - words[1] = op.getAddress(); + words[8] = op.getArgIdx(); + words[9] = 0; - words[2] = op.getValue(); + words[10] = op.getArgPlus(); + words[11] = 0; } void appendWriteBdShimTile(std::vector &instructions, - NpuWriteBdExShimTileOp op) { + NpuWriteBdOp op) { - auto words = reserveAndGetTail(instructions, 10); + auto words = reserveAndGetTail(instructions, 12); + const AIETargetModel &tm = op->getParentOfType().getTargetModel(); - uint32_t opCode = 6; - words[0] |= (opCode & 0xff) << 24; - words[0] |= (op.getColumn() & 0xff) << 16; - words[0] |= (op.getColumnNum() & 0xff) << 8; - words[0] |= (op.getDdrId() & 0xf) << 4; - words[0] |= (op.getBdId() & 0xf); + // XAIE_IO_BLOCKWRITE + words[0] = TXN_OPC_BLOCKWRITE; + words[1] = 0; - // TODO: Address Incr - // words[1] = ... 
+ // RegOff + auto bd_id = op.getBdId(); + uint32_t bd_addr = (op.getColumn() << tm.getColumnShift()) | + (op.getRow() << tm.getRowShift()) | + (0x1D000 + bd_id * 0x20); + words[2] = bd_addr; // ADDR + words[3] = words.size() * sizeof(uint32_t); // Operation Size - words[2] = op.getBufferLength(); - words[3] = op.getBufferOffset(); + // DMA_BDX_0 + words[4] = op.getBufferLength(); + // DMA_BDX_1 + words[5] = op.getBufferOffset(); + + // DMA_BDX_2 // En Packet , OoO BD ID , Packet ID , Packet Type - words[4] |= (op.getEnablePacket() & 0x1) << 30; - words[4] |= (op.getOutOfOrderId() & 0x3f) << 24; - words[4] |= (op.getPacketId() & 0x1f) << 19; - words[4] |= (op.getPacketType() & 0x7) << 16; + words[6] |= (op.getEnablePacket() & 0x1) << 30; + words[6] |= (op.getOutOfOrderId() & 0x3f) << 24; + words[6] |= (op.getPacketId() & 0x1f) << 19; + words[6] |= (op.getPacketType() & 0x7) << 16; + // DMA_BDX_3 // TODO: Secure Access - words[5] |= (op.getD0Size() & 0x3ff) << 20; - words[5] |= op.getD0Stride() & 0xfffff; + words[7] |= (op.getD0Size() & 0x3ff) << 20; + words[7] |= op.getD0Stride() & 0xfffff; - words[6] = 0x80000000; // burst length; - words[6] |= (op.getD1Size() & 0x3ff) << 20; - words[6] |= op.getD1Stride() & 0xfffff; + // DMA_BDX_4 + words[8] = 0x80000000; // burst length; + words[8] |= (op.getD1Size() & 0x3ff) << 20; + words[8] |= op.getD1Stride() & 0xfffff; + // DMA_BDX_5 // TODO: SIMID, AxCache, AXQoS - words[7] = op.getD2Stride() & 0xfffff; + words[9] = op.getD2Stride() & 0xfffff; - words[8] |= (op.getIterationCurrent() & 0x3f) << 26; - words[8] |= (op.getIterationSize() & 0x3f) << 20; - words[8] |= op.getIterationStride() & 0xfffff; + // DMA_BDX_6 + words[10] |= (op.getIterationCurrent() & 0x3f) << 26; + words[10] |= (op.getIterationSize() & 0x3f) << 20; + words[10] |= op.getIterationStride() & 0xfffff; + // DMA_BDX_7 // TODO: TLAST Suppress - words[9] |= (op.getNextBd() & 0xf) << 27; - words[9] |= (op.getUseNextBd() & 0x1) << 26; - words[9] |= 
(op.getValidBd() & 0x1) << 25; - words[9] |= (op.getLockRelVal() & 0xef) << 18; - words[9] |= (op.getLockRelId() & 0xf) << 13; - words[9] |= (op.getLockAcqEnable() & 0x1) << 12; - words[9] |= (op.getLockAcqVal() & 0xef) << 5; - words[9] |= op.getLockAcqId() & 0xf; + words[11] |= (op.getNextBd() & 0xf) << 27; + words[11] |= (op.getUseNextBd() & 0x1) << 26; + words[11] |= (op.getValidBd() & 0x1) << 25; + words[11] |= (op.getLockRelVal() & 0xef) << 18; + words[11] |= (op.getLockRelId() & 0xf) << 13; + words[11] |= (op.getLockAcqEnable() & 0x1) << 12; + words[11] |= (op.getLockAcqVal() & 0xef) << 5; + words[11] |= op.getLockAcqId() & 0xf; } } // namespace std::vector xilinx::AIE::AIETranslateToNPU(ModuleOp module) { - std::vector instructions = getProlog(); + std::vector instructions; + + auto words = reserveAndGetTail(instructions, 4); + + // setup txn header + words[0] = 0x06030100; + words[1] = 0x00000105; DeviceOp deviceOp = *module.getOps().begin(); auto funcOps = deviceOp.getOps(); + int count = 0; for (auto f : funcOps) { if (f.isDeclaration()) continue; Block &entry = f.getRegion().front(); for (auto &o : entry) { llvm::TypeSwitch(&o) - .Case([&](auto op) { appendSync(instructions, op); }) - .Case([&](auto op) { appendWrite32(instructions, op); }) - .Case( - [&](auto op) { appendWriteBdShimTile(instructions, op); }); + .Case([&](auto op) { + count++; + appendSync(instructions, op); + }) + .Case([&](auto op) { + count++; + appendWrite32(instructions, op); + }) + .Case([&](auto op) { + count++; + appendAddressPatch(instructions, op); + }) + .Case([&](auto op) { + count++; + appendWriteBdShimTile(instructions, op); + }); } } + // write size fields of the txn header + instructions[2] = count; + instructions[3] = instructions.size() * sizeof(uint32_t); return instructions; } diff --git a/programming_examples/basic/dma_transpose/test.cpp b/programming_examples/basic/dma_transpose/test.cpp index fa9a918669..d8cd12ef08 100644 --- 
a/programming_examples/basic/dma_transpose/test.cpp +++ b/programming_examples/basic/dma_transpose/test.cpp @@ -149,13 +149,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, - kernel.group_id(2)); - auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects." << std::endl; @@ -174,7 +174,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel." 
<< std::endl; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index dba7e6c221..e92f4e699c 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -45,12 +45,12 @@ mlir_target?=build/aie_${M}x${K}x${N}.mlir xclbin_target?=build/final_${M}x${K}x${N}.xclbin insts_target?=build/insts_${M}x${K}x${N}.txt -runargs?=-v 1 --warmup 10 --iters 10 +runargs?=-v 1 --warmup 1 --iters 1 kernels_dir=${srcdir}/../../../../aie_kernels/aie2 .PHONY: all -all: ${xclbin_target} ${insts_target} ${targetname}.exe +all: ${xclbin_target} ${targetname}.exe build/%.o: ${kernels_dir}/%.cc mkdir -p ${@D} @@ -82,11 +82,11 @@ sign: ${xclbin_target} ${xclbin_sign} -dev Phoenix -xclbin $< .PHONY: run -run: ${targetname}.exe ${xclbin_target} ${insts_target} #sign +run: ${targetname}.exe ${xclbin_target} export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \ ${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs} -trace: ${targetname}.exe ${xclbin_target} ${insts_target} # sign +trace: ${targetname}.exe ${xclbin_target} ${insts_target} export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \ ${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size} ../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > trace_mm.json diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py index 6b27d9f9e3..54276121c8 100644 --- 
a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py @@ -17,26 +17,20 @@ def my_matmul(): K = 288 m = 32 k = 32 - word_size_in = 2 - word_size_out = 4 n_cores = 1 - A_sz_in_i32s = M * K * word_size_in // 4 - B_sz_in_i32s = K * word_size_in // 4 - C_sz_in_bytes = M * word_size_out - C_sz_in_i32s = C_sz_in_bytes // 4 - C_sz_div_n_cores_in_i32s = C_sz_in_i32s // n_cores + A_sz = M * K + B_sz = K + C_sz = M + C_sz_div_n_cores = C_sz // n_cores M_div_m = M // m M_div_m_div_n_cores = M // (m * n_cores) K_div_k = K // k - K_in_i32s = K * word_size_in // 4 - k_in_i32s = k * word_size_in // 4 - m_in_i32s = m * word_size_in // 4 - m_x_k_in_i32s = m * k * word_size_in // 4 - m_x_K_in_i32s = m * K * word_size_in // 4 + m_x_k = m * k + m_x_K = m * K vectorized = True @@ -172,35 +166,35 @@ def core_body(): # To/from AIE-array data movement @FuncOp.from_py_func( - T.memref(A_sz_in_i32s, T.i32()), - T.memref(B_sz_in_i32s, T.i32()), - T.memref(C_sz_in_i32s, T.i32()), + T.memref(A_sz, T.bf16()), + T.memref(B_sz, T.bf16()), + T.memref(C_sz, T.f32()), ) def sequence(A, B, C): npu_dma_memcpy_nd( metadata=inB_fifo_names[0], bd_id=2, mem=B, - sizes=[M_div_m_div_n_cores, 1, 1, K_in_i32s], + sizes=[M_div_m_div_n_cores, 1, 1, K], strides=[0, 0, 0], ) for i in range(n_cores): - A_offset = i * M_div_m_div_n_cores * m * K * word_size_in // 4 - C_offset = i * M_div_m_div_n_cores * m * word_size_out // 4 + A_offset = i * M_div_m_div_n_cores * m * K + C_offset = i * M_div_m_div_n_cores * m npu_dma_memcpy_nd( metadata=memA_fifo_names[i], bd_id=1, mem=A, offsets=[0, 0, 0, A_offset], - sizes=[M_div_m_div_n_cores, K_div_k, m, k_in_i32s], - strides=[m_x_K_in_i32s, k_in_i32s, K_in_i32s], + sizes=[M_div_m_div_n_cores, K_div_k, m, k], + strides=[m_x_K, k, K], ) npu_dma_memcpy_nd( metadata=outC_fifo_names[i], bd_id=0, mem=C, offsets=[0, 0, 0, C_offset], - sizes=[1, 1, 1, C_sz_div_n_cores_in_i32s], + sizes=[1, 
1, 1, C_sz_div_n_cores], strides=[0, 0, 0], ) diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 909fba0c43..ba312aa417 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -22,36 +22,26 @@ def my_matmul(): r = 4 s = 8 t = 4 - word_size_in = 2 - word_size_out = 2 vectorized = True enable_tracing = False trace_size = 65536 - A_sz_in_i32s = M * K * word_size_in // 4 - B_sz_in_i32s = K * N * word_size_in // 4 - C_sz_in_bytes = M * N * word_size_out - C_sz_in_i32s = C_sz_in_bytes // 4 + A_sz = M * K + B_sz = K * N + C_sz = M * N + C_sz_in_bytes = C_sz * 2 M_div_m = M // m K_div_k = K // k N_div_n = N // n tiles = M_div_m * N_div_n - # Matrix A: MxK, submatrices a: mxk - k_in_i32s = k * word_size_in // 4 - K_in_i32s = K * word_size_in // 4 - # Matrix B: KxN, submatrices b: kxn - n_in_i32s = n * word_size_in // 4 - N_in_i32s = N * word_size_in // 4 - k_x_N_in_i32s = k * N * word_size_in // 4 + k_x_N = k * N # Output Matrix C: MxN - n_in_i32s_out = n * word_size_out // 4 - N_in_i32s_out = N * word_size_out // 4 - m_x_N_in_i32s_out = m * N * word_size_out // 4 + m_x_N = m * N with mlir_mod_ctx() as ctx: @@ -169,9 +159,9 @@ def core_body(): # To/from AIE-array data movement @FuncOp.from_py_func( - T.memref(A_sz_in_i32s, T.i32()), - T.memref(B_sz_in_i32s, T.i32()), - T.memref(C_sz_in_i32s, T.i32()), + T.memref(A_sz, T.bf16()), + T.memref(B_sz, T.bf16()), + T.memref(C_sz, T.bf16()), ) def sequence(A, B, C): @@ -189,9 +179,7 @@ def sequence(A, B, C): for tile_row_block in range( (M_div_m + rows_per_block - 1) // rows_per_block ): - C_row_offset_in_i32s = ( - tile_row_block * rows_per_block * m * N * word_size_out // 4 - ) + C_row_offset = tile_row_block * rows_per_block * m * N num_tile_rows = min( [rows_per_block, M_div_m - tile_row_block * rows_per_block] ) @@ 
-199,32 +187,28 @@ def sequence(A, B, C): metadata="outC", bd_id=0, mem=C, - offsets=[0, 0, 0, C_row_offset_in_i32s], - sizes=[num_tile_rows, N_div_n, m, n_in_i32s_out], - strides=[m_x_N_in_i32s_out, n_in_i32s_out, N_in_i32s_out], + offsets=[0, 0, 0, C_row_offset], + sizes=[num_tile_rows, N_div_n, m, n], + strides=[m_x_N, n, N], ) for tile_row in range(num_tile_rows): - A_row_offset_in_i32s = ( - ((tile_row_block * rows_per_block) + tile_row) - * m - * K - * word_size_in - // 4 + A_row_offset = ( + ((tile_row_block * rows_per_block) + tile_row) * m * K ) npu_dma_memcpy_nd( metadata="inA", bd_id=2 * tile_row + 1, mem=A, - offsets=[0, 0, 0, A_row_offset_in_i32s], - sizes=[N_div_n, K_div_k, m, k_in_i32s], - strides=[0, k_in_i32s, K_in_i32s], + offsets=[0, 0, 0, A_row_offset], + sizes=[N_div_n, K_div_k, m, k], + strides=[0, k, K], ) npu_dma_memcpy_nd( metadata="inB", bd_id=2 * tile_row + 2, mem=B, - sizes=[N_div_n, K_div_k, k, n_in_i32s], - strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s], + sizes=[N_div_n, K_div_k, k, n], + strides=[n, k_x_N, N], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp index 5fe8711de3..fded6f0de7 100644 --- a/programming_examples/basic/matrix_multiplication/test.cpp +++ b/programming_examples/basic/matrix_multiplication/test.cpp @@ -118,13 +118,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_a = - xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_b = - xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_out = - 
xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) { std::cout << "Writing data into buffer objects.\n"; @@ -175,8 +175,13 @@ int main(int argc, const char *argv[]) { std::cout << "Running Kernel.\n"; } auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_out); - run.wait(); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_out); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "kernel did not complete. returned status: " << r << "\n"; + return 1; + } auto stop = std::chrono::high_resolution_clock::now(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/matrix_scalar_add/test.cpp b/programming_examples/basic/matrix_scalar_add/test.cpp index c8804fecfc..e405ab9d6b 100644 --- a/programming_examples/basic/matrix_scalar_add/test.cpp +++ b/programming_examples/basic/matrix_scalar_add/test.cpp @@ -138,13 +138,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IMAGE_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IMAGE_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, IMAGE_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IMAGE_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, IMAGE_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -163,7 +163,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 
1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/passthrough_dmas/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp index 10e552ba69..9c11596119 100644 --- a/programming_examples/basic/passthrough_dmas/test.cpp +++ b/programming_examples/basic/passthrough_dmas/test.cpp @@ -140,13 +140,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, - kernel.group_id(2)); - auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects." << std::endl; @@ -165,7 +165,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel." 
<< std::endl; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index 4fe9a7ed9b..fcd6c84632 100644 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -19,7 +19,6 @@ def passthroughKernel(vector_size, trace_size): N = vector_size lineWidthInBytes = N // 4 # chop input in 4 sub-tensors - lineWidthInInt32s = lineWidthInBytes // 4 @device(AIEDevice.npu1_1col) def device_body(): @@ -58,9 +57,7 @@ def core_body(): # print(ctx.module.operation.verify()) - tensorSize = N - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) + tensor_ty = T.memref(N, T.ui8()) @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(inTensor, outTensor, notUsed): @@ -70,20 +67,20 @@ def sequence(inTensor, outTensor, notUsed): ShimTile, ddr_id=1, size=trace_size, - offset=tensorSize, + offset=N, ) npu_dma_memcpy_nd( metadata="in", bd_id=0, mem=inTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, N], ) npu_dma_memcpy_nd( metadata="out", bd_id=1, mem=outTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, N], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/basic/passthrough_kernel/test.cpp b/programming_examples/basic/passthrough_kernel/test.cpp index f698a3824d..f28691abcd 100644 --- a/programming_examples/basic/passthrough_kernel/test.cpp +++ b/programming_examples/basic/passthrough_kernel/test.cpp @@ -56,9 +56,9 @@ int main(int argc, const char *argv[]) { // set up the buffer objects auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + 
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, PASSTHROUGH_SIZE * sizeof(DATATYPE), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_out = xrt::bo(device, PASSTHROUGH_SIZE * sizeof(DATATYPE) + trace_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); @@ -87,7 +87,8 @@ int main(int argc, const char *argv[]) { // Execute the kernel and wait to finish if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out); run.wait(); // Sync device to host memories diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py index 30b902f8fb..814f8c7a6a 100644 --- a/programming_examples/basic/passthrough_kernel/test.py +++ b/programming_examples/basic/passthrough_kernel/test.py @@ -50,9 +50,9 @@ def main(opts): # ------------------------------------------------------ # Initialize input/ output buffer sizes and sync them # ------------------------------------------------------ - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0)) - bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) - bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) + bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(1)) + bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(3)) + bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(4)) # Initialize instruction buffer bo_instr.write(instr_v, 0) @@ -80,7 +80,8 @@ def main(opts): # Run kernel if opts.verbosity >= 1: print("Running Kernel.") - h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1) + opcode = 3 + h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1) h.wait() 
bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) diff --git a/programming_examples/basic/vector_exp/aie2.py b/programming_examples/basic/vector_exp/aie2.py index af58a6392b..87c8f33c31 100644 --- a/programming_examples/basic/vector_exp/aie2.py +++ b/programming_examples/basic/vector_exp/aie2.py @@ -17,12 +17,7 @@ # AI Engine structural design function def my_eltwise_exp(): - word_size_in = 2 N = 65536 - N_in_bytes = N * word_size_in - - A_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 # Tile sizes n = 1024 @@ -103,16 +98,12 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N, T.bf16()) @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): - npu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) + npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/basic/vector_exp/test.cpp b/programming_examples/basic/vector_exp/test.cpp index deb895d238..9e39a4cc39 100644 --- a/programming_examples/basic/vector_exp/test.cpp +++ b/programming_examples/basic/vector_exp/test.cpp @@ -142,11 +142,11 @@ int main(int argc, const char *argv[]) { // Initialize input/ output buffer sizes and sync them // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inout1 = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, 
OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -192,7 +192,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inout0, bo_inout1); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/vector_reduce_add/test.cpp b/programming_examples/basic/vector_reduce_add/test.cpp index 85c723d2ec..c24254240c 100644 --- a/programming_examples/basic/vector_reduce_add/test.cpp +++ b/programming_examples/basic/vector_reduce_add/test.cpp @@ -83,11 +83,11 @@ int main(int argc, const char *argv[]) { // Initialize input/ output buffer sizes and sync them // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -136,7 +136,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inout0, bo_inout1); run.wait(); auto stop = 
std::chrono::high_resolution_clock::now(); bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/vector_reduce_max/test.cpp b/programming_examples/basic/vector_reduce_max/test.cpp index c3e7dd33c4..dc3161d401 100644 --- a/programming_examples/basic/vector_reduce_max/test.cpp +++ b/programming_examples/basic/vector_reduce_max/test.cpp @@ -84,11 +84,11 @@ int main(int argc, const char *argv[]) { // Initialize input/ output buffer sizes and sync them // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -138,7 +138,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inout0, bo_inout1); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/vector_reduce_min/test.cpp b/programming_examples/basic/vector_reduce_min/test.cpp index e43ac32b96..233f87d7de 100644 --- a/programming_examples/basic/vector_reduce_min/test.cpp +++ b/programming_examples/basic/vector_reduce_min/test.cpp @@ -84,11 +84,11 @@ int main(int argc, const char *argv[]) { // Initialize input/ output buffer sizes and sync them // 
------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -138,7 +138,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inout0, bo_inout1); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/vector_scalar_add/test.cpp b/programming_examples/basic/vector_scalar_add/test.cpp index f92f856b37..bb5387827e 100644 --- a/programming_examples/basic/vector_scalar_add/test.cpp +++ b/programming_examples/basic/vector_scalar_add/test.cpp @@ -99,11 +99,11 @@ int main(int argc, const char *argv[]) { // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); if (verbosity >= 1) std::cout << "Writing data into 
buffer objects.\n"; @@ -122,7 +122,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py index 8d367ced50..b0a957393b 100644 --- a/programming_examples/basic/vector_scalar_mul/aie2.py +++ b/programming_examples/basic/vector_scalar_mul/aie2.py @@ -17,10 +17,8 @@ def my_vector_scalar(vector_size, trace_size): - word_size_in = 2 N = vector_size - N_in_i32s = N * word_size_in // 4 - N_in_bytes = N_in_i32s * 4 + N_in_bytes = N * 2 N_div_n = 4 # chop input vector into 4 sub-vectors n = N // N_div_n @@ -82,7 +80,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N_in_i32s, T.i32()) + tensor_ty = T.memref(N, T.i16()) scalar_ty = T.memref(1, T.i32()) @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) @@ -96,10 +94,8 @@ def sequence(A, F, C): size=trace_size, offset=N_in_bytes, ) - npu_dma_memcpy_nd( - metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N_in_i32s] - ) - npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s]) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/basic/vector_scalar_mul/test.cpp b/programming_examples/basic/vector_scalar_mul/test.cpp index fe81d3ba9e..d4acb04292 100644 --- a/programming_examples/basic/vector_scalar_mul/test.cpp +++ b/programming_examples/basic/vector_scalar_mul/test.cpp @@ -65,13 +65,13 @@ int main(int argc, const char *argv[]) { // set up the buffer 
objects auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = - xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_outC = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -102,7 +102,9 @@ int main(int argc, const char *argv[]) { // Execute the kernel and wait to finish if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); + unsigned int opcode = 3; + auto run = + kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); run.wait(); // Sync device to host memories diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py index c9e6af8d1b..87f2f9fffe 100644 --- a/programming_examples/basic/vector_scalar_mul/test.py +++ b/programming_examples/basic/vector_scalar_mul/test.py @@ -54,10 +54,10 @@ def main(opts): # ------------------------------------------------------ # Initialize input/ output buffer sizes and sync them # ------------------------------------------------------ - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0)) - bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) - bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) - bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4)) + bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(1)) + bo_inout0 = 
xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(3)) + bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(4)) + bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(5)) # Initialize instruction buffer bo_instr.write(instr_v, 0) @@ -88,7 +88,8 @@ def main(opts): # Run kernel if opts.verbosity >= 1: print("Running Kernel.") - h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) + opcode = 3 + h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) h.wait() bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) diff --git a/programming_examples/basic/vector_vector_add/test.cpp b/programming_examples/basic/vector_vector_add/test.cpp index b23642d10f..f36f6a3328 100644 --- a/programming_examples/basic/vector_vector_add/test.cpp +++ b/programming_examples/basic/vector_vector_add/test.cpp @@ -99,13 +99,13 @@ int main(int argc, const char *argv[]) { // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -131,7 +131,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, 
bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/vector_vector_modulo/test.cpp b/programming_examples/basic/vector_vector_modulo/test.cpp index 9bab382f7a..4082961c2e 100644 --- a/programming_examples/basic/vector_vector_modulo/test.cpp +++ b/programming_examples/basic/vector_vector_modulo/test.cpp @@ -99,13 +99,13 @@ int main(int argc, const char *argv[]) { // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -131,7 +131,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/vector_vector_mul/test.cpp b/programming_examples/basic/vector_vector_mul/test.cpp index 8881fba175..52af9beb06 100644 --- a/programming_examples/basic/vector_vector_mul/test.cpp +++ b/programming_examples/basic/vector_vector_mul/test.cpp @@ -99,13 +99,13 @@ int main(int argc, const char *argv[]) { // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - 
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -131,7 +131,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py index 669aacb415..9ee60a3b62 100644 --- a/programming_examples/ml/bottleneck/aie2.py +++ b/programming_examples/ml/bottleneck/aie2.py @@ -501,16 +501,16 @@ def core_body(): yield_([]) # instruction stream generation - activationsInSize32b = (tensorInW * tensorInH * tensorInC) // 4 - acitivationsOutSize32b = activationsInSize32b - totalWeightsSize32b = ( + activationsIn = tensorInW * tensorInH * tensorInC + acitivationsOut = activationsIn + totalWeights = ( tensorL1InC * tensorL1OutC + 3 * 3 * tensorL2InC * tensorL2OutC + tensorL3InC * tensorL3OutC - ) // 4 + ) - activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty) - weightsInL3_ty = MemRefType.get((totalWeightsSize32b,), int32_ty) + activationsInL3_ty = MemRefType.get((activationsIn,), int8_ty) + weightsInL3_ty = MemRefType.get((totalWeights,), uint8_ty) @FuncOp.from_py_func(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty) 
def sequence(inputFromL3, weightsFromL3, outputToL3): @@ -565,16 +565,16 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): npu_write32(0, 4, 0x340D0, 0x10000) # Start trace copy out. - npu_writebd_shimtile( + npu_writebd( bd_id=3, buffer_length=trace_sz_in_i32s, - buffer_offset=acitivationsOutSize32b, + buffer_offset=acitivationsOut, enable_packet=0, out_of_order_id=0, packet_id=0, packet_type=0, column=0, - column_num=1, + row=0, d0_stepsize=0, d0_wrap=0, d1_stepsize=0, @@ -616,19 +616,19 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): metadata="inOF_act_L3L2", bd_id=0, mem=inputFromL3, - sizes=[1, 1, 1, activationsInSize32b], + sizes=[1, 1, 1, activationsIn], ) npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=outputToL3, - sizes=[1, 1, 1, acitivationsOutSize32b], + sizes=[1, 1, 1, acitivationsOut], ) npu_dma_memcpy_nd( metadata="inOF_wts_0_L3L2", bd_id=1, mem=weightsFromL3, - sizes=[1, 1, 1, totalWeightsSize32b], + sizes=[1, 1, 1, totalWeights], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py index 11e92f55c2..b6fb537a26 100644 --- a/programming_examples/ml/conv2d/aie2.py +++ b/programming_examples/ml/conv2d/aie2.py @@ -25,14 +25,11 @@ actIn = width * in_channels # 32*64 = 2048 bufIn = actIn * 2 # double buffer -actInInt32s = actIn // 4 weights = in_channels * out_channels -weightsInInt32s = weights // 4 actOut = width * out_channels # 32*64 = 2048 bufOut = actOut * 2 # double buffer -actOutInt32s = actOut // 4 def conv2dk1(): @@ -141,9 +138,8 @@ def core_body(): # To/from AIE-array data movement tensorSize = width * height * in_channels - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) - memRef_wts_ty = T.memref(weightsInInt32s, T.i32()) + tensor_ty = T.memref(tensorSize, T.i8()) + memRef_wts_ty = T.memref(weights, T.i8()) # memRef_16x16_ty = T.memref(16, 16, T.i32()) @FuncOp.from_py_func(tensor_ty, 
memRef_wts_ty, tensor_ty) @@ -154,19 +150,19 @@ def sequence(I, W, O): metadata="inOF_act_L3L2", bd_id=0, mem=I, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=O, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="inOF_wts_0_L3L2", bd_id=2, mem=W, - sizes=[1, 1, 1, weightsInInt32s], + sizes=[1, 1, 1, weights], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py index faafaf4d86..efd1b13555 100644 --- a/programming_examples/ml/conv2d_fused_relu/aie2.py +++ b/programming_examples/ml/conv2d_fused_relu/aie2.py @@ -25,14 +25,11 @@ actIn = width * in_channels # 32*64 = 2048 bufIn = actIn * 2 # double buffer -actInInt32s = actIn // 4 weights = in_channels * out_channels -weightsInInt32s = weights // 4 actOut = width * out_channels # 32*64 = 2048 bufOut = actOut * 2 # double buffer -actOutInt32s = actOut // 4 enableTrace = False trace_size = 16384 @@ -148,9 +145,8 @@ def core_body(): # To/from AIE-array data movement tensorSize = width * height * in_channels - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) - memRef_wts_ty = T.memref(weightsInInt32s, T.i32()) + tensor_ty = T.memref(tensorSize, T.i8()) + memRef_wts_ty = T.memref(weights, T.i8()) # memRef_16x16_ty = T.memref(16, 16, T.i32()) @FuncOp.from_py_func(tensor_ty, memRef_wts_ty, tensor_ty) @@ -203,7 +199,7 @@ def sequence(I, W, O): # out to host DDR memory trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory output_size = bufOut - npu_writebd_shimtile( + npu_writebd( bd_id=trace_bd_id, buffer_length=trace_size, buffer_offset=output_size, @@ -240,19 +236,19 @@ def sequence(I, W, O): metadata="inOF_act_L3L2", bd_id=0, mem=I, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) 
npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=O, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="inOF_wts_0_L3L2", bd_id=2, mem=W, - sizes=[1, 1, 1, weightsInInt32s], + sizes=[1, 1, 1, weights], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py index 354e9f78d1..4d0716fa1c 100644 --- a/programming_examples/ml/eltwise_add/aie2.py +++ b/programming_examples/ml/eltwise_add/aie2.py @@ -21,10 +21,6 @@ def my_eltwise_add(trace_size): N = 65536 N_in_bytes = N * word_size_in - A_sz_in_i32s = N_in_bytes // 4 - B_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 - # Tile sizes n = 1024 N_div_n = N // n @@ -129,7 +125,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N, T.bf16()) @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): @@ -143,15 +139,9 @@ def sequence(A, B, C): offset=N_in_bytes, ) - npu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s] - ) + npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, N]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/eltwise_add/test.cpp b/programming_examples/ml/eltwise_add/test.cpp index e595bf4a4e..4e918601d3 100644 --- a/programming_examples/ml/eltwise_add/test.cpp +++ b/programming_examples/ml/eltwise_add/test.cpp @@ -143,14 +143,14 @@ int main(int argc, const char *argv[]) { // Initialize input/ output buffer sizes and sync them // 
------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); // Assumes trace will only be added to inout2 auto bo_inout2 = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -209,8 +209,9 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = - kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inout0, bo_inout1, + bo_inout2); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/ml/eltwise_mul/aie2.py b/programming_examples/ml/eltwise_mul/aie2.py index 5808d0c998..4966ecd06e 100644 --- a/programming_examples/ml/eltwise_mul/aie2.py +++ b/programming_examples/ml/eltwise_mul/aie2.py @@ -21,10 +21,6 @@ def my_eltwise_mul(trace_size): N = 65536 N_in_bytes = N * word_size_in - A_sz_in_i32s = N_in_bytes // 4 - B_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 - # Tile sizes n = 1024 N_div_n = N // n @@ -130,7 +126,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N, T.bf16()) @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def 
sequence(A, B, C): @@ -144,15 +140,9 @@ def sequence(A, B, C): offset=N_in_bytes, ) - npu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s] - ) + npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, N]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/eltwise_mul/test.cpp b/programming_examples/ml/eltwise_mul/test.cpp index a80eabf2cd..6c4f49bf5a 100644 --- a/programming_examples/ml/eltwise_mul/test.cpp +++ b/programming_examples/ml/eltwise_mul/test.cpp @@ -143,14 +143,14 @@ int main(int argc, const char *argv[]) { // Initialize input/ output buffer sizes and sync them // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); // Assumes trace will only be added to inout2 auto bo_inout2 = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -209,8 +209,9 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = - 
kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inout0, bo_inout1, + bo_inout2); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/ml/relu/aie2.py b/programming_examples/ml/relu/aie2.py index e4da4eafdf..2d62135f27 100644 --- a/programming_examples/ml/relu/aie2.py +++ b/programming_examples/ml/relu/aie2.py @@ -21,9 +21,6 @@ def my_relu(trace_size): N = 65536 N_in_bytes = N * word_size_in - A_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 - # Tile sizes n = 1024 N_div_n = N // n @@ -105,7 +102,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N, T.bf16()) @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): @@ -118,12 +115,8 @@ def sequence(A, C): size=trace_size, offset=N_in_bytes, ) - npu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) + npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/relu/test.cpp b/programming_examples/ml/relu/test.cpp index 170d90d9fd..361ed7da10 100644 --- a/programming_examples/ml/relu/test.cpp +++ b/programming_examples/ml/relu/test.cpp @@ -105,11 +105,11 @@ int main(int argc, const char *argv[]) { // Initialize input/ output buffer sizes and sync them // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, 
kernel.group_id(2)); + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inout1 = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); // Assumes trace will only be added to inout1 if (verbosity >= 1) @@ -161,7 +161,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inout0, bo_inout1); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/ml/resnet/layers_conv2_x/Makefile b/programming_examples/ml/resnet/layers_conv2_x/Makefile index 4b40c07da9..ff2361ba48 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/Makefile +++ b/programming_examples/ml/resnet/layers_conv2_x/Makefile @@ -19,8 +19,6 @@ all: build/conv2dk1_i8.o build/conv2dk1_skip_init.o build/conv2dk3.o build/conv2 build/${mlirFileName}.mlir: ${srcdir}/aie2.py mkdir -p ${@D} python3 $< > $@ -insts.txt: build/${mlirFileName}.mlir - aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $< build/conv2dk1_i8.o: conv2dk1.cc xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ @@ -41,9 +39,7 @@ build/final.xclbin: build/${mlirFileName}.mlir build/conv2dk1_i8.o build/conv2dk cd build && aiecc.py --basic-alloc-scheme --aie-generate-cdo --aie-generate-npu --no-compile-host \ --xclbin-name=${@F} --npu-insts-name=insts.txt ${= 1) @@ -171,7 +171,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), 
bo_inout0, bo_inout1); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/vision/color_detect/aie2_colorDetect.py b/programming_examples/vision/color_detect/aie2_colorDetect.py index 19e4e04ca9..9a66785bbb 100644 --- a/programming_examples/vision/color_detect/aie2_colorDetect.py +++ b/programming_examples/vision/color_detect/aie2_colorDetect.py @@ -22,11 +22,9 @@ lineWidth = width lineWidthInBytes = width * 4 -lineWidthInInt32s = lineWidthInBytes // 4 enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 +traceSize = 1024 def color_detect(): @@ -242,8 +240,7 @@ def coreBody(): # To/from AIE-array data movement tensorSize = width * height * 4 # 4 channels - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = MemRefType.get((tensorSizeInInt32s,), T.i32()) + tensor_ty = MemRefType.get((tensorSize,), T.i8()) memRef_16x16_ty = MemRefType.get( ( 16, @@ -258,13 +255,13 @@ def sequence(I, B, O): metadata="inOF_L3L2", bd_id=1, mem=I, - sizes=[1, 1, 1, height * lineWidthInInt32s], + sizes=[1, 1, 1, height * lineWidthInBytes], ) npu_dma_memcpy_nd( metadata="outOF_L2L3", bd_id=0, mem=O, - sizes=[1, 1, 1, height * lineWidthInInt32s], + sizes=[1, 1, 1, height * lineWidthInBytes], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/vision/color_detect/test.cpp b/programming_examples/vision/color_detect/test.cpp index f76c9fb7c2..d717324331 100644 --- a/programming_examples/vision/color_detect/test.cpp +++ b/programming_examples/vision/color_detect/test.cpp @@ -152,12 +152,12 @@ int main(int argc, const char *argv[]) { **************************************************************************** */ auto boInstr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto boInA = xrt::bo(device, inImageRGBA.total() * inImageRGBA.elemSize(), 
- XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto boInB = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto boInB = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto boOut = xrt::bo(device, (outImageTest.total() * outImageTest.elemSize()), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -180,7 +180,8 @@ int main(int argc, const char *argv[]) { // Execute the kernel and wait to finish if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(boInstr, instr_v.size(), boInA, boInB, boOut); + unsigned int opcode = 3; + auto run = kernel(opcode, boInstr, instr_v.size(), boInA, boInB, boOut); run.wait(); // Sync device to host memories @@ -269,7 +270,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(boInstr, instr_v.size(), boInA, boInB, boOut); + unsigned int opcode = 3; + auto run = kernel(opcode, boInstr, instr_v.size(), boInA, boInB, boOut); run.wait(); // Sync device to host memories diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py index 1215a4ddd0..fa067226dc 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py @@ -247,12 +247,11 @@ def core_body(): # To/from AIE-array data movement tensorSize = width * height - tensorSizeInInt32s = tensorSize // 4 @FuncOp.from_py_func( - T.memref(tensorSizeInInt32s, T.i32()), + T.memref(tensorSize, T.i8()), T.memref(32, T.i32()), # not used - T.memref(tensorSizeInInt32s, T.i32()), + T.memref(tensorSize, T.i8()), ) def sequence(inTensor, notUsed, outTensor): # thresholdValue, maxValue, thresholdType @@ -276,13 +275,13 @@ def sequence(inTensor, 
notUsed, outTensor): metadata="inOOB_L3L2", bd_id=1, mem=inTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="outOOB_L2L3", bd_id=0, mem=outTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/vision/color_threshold/test.cpp b/programming_examples/vision/color_threshold/test.cpp index c9f4dfc5df..04fcbe8db7 100644 --- a/programming_examples/vision/color_threshold/test.cpp +++ b/programming_examples/vision/color_threshold/test.cpp @@ -99,13 +99,13 @@ int main(int argc, const char *argv[]) { */ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_in = - xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto debug = xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto debug = + xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_out = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -128,7 +128,8 @@ int main(int argc, const char *argv[]) { // Execute the kernel and wait to finish if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_in, debug, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in, debug, bo_out); run.wait(); // Sync device to host memories diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py index 1af069d94e..3e095e356d 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py @@ -22,7 +22,6 
@@ heightMinus1 = height - 1 lineWidth = width lineWidthInBytes = width * 4 -lineWidthInInt32s = lineWidthInBytes // 4 enableTrace = False traceSizeInBytes = 8192 @@ -294,8 +293,7 @@ def core_body(): # To/from AIE-array data movement tensorSize = width * height * 4 # 4 channels - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) + tensor_ty = T.memref(tensorSize, T.i8()) memRef_16x16_ty = T.memref(16, 16, T.i32()) @FuncOp.from_py_func(tensor_ty, memRef_16x16_ty, tensor_ty) @@ -304,13 +302,13 @@ def sequence(I, B, O): metadata="outOF_L2L3", bd_id=0, mem=O, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="inOF_L3L2", bd_id=1, mem=I, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/vision/edge_detect/test.cpp b/programming_examples/vision/edge_detect/test.cpp index 87018cb2b6..cb8d7287d3 100644 --- a/programming_examples/vision/edge_detect/test.cpp +++ b/programming_examples/vision/edge_detect/test.cpp @@ -188,14 +188,14 @@ int main(int argc, const char *argv[]) { **************************************************************************** */ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, inImageRGBA.total() * inImageRGBA.elemSize(), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inB = - xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_out = xrt::bo(device, (outImageTest.total() * outImageTest.elemSize()), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -218,7 +218,8 @@ int 
main(int argc, const char *argv[]) { // Execute the kernel and wait to finish if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); // Sync device to host memories @@ -306,7 +307,9 @@ int main(int argc, const char *argv[]) { // Execute the kernel and wait to finish if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = + kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); // Sync device to host memories diff --git a/programming_examples/vision/vision_passthrough/aie2.py b/programming_examples/vision/vision_passthrough/aie2.py index 8d568af388..35e1c5f515 100644 --- a/programming_examples/vision/vision_passthrough/aie2.py +++ b/programming_examples/vision/vision_passthrough/aie2.py @@ -19,7 +19,6 @@ height = int(sys.argv[2]) lineWidthInBytes = width -lineWidthInInt32s = lineWidthInBytes // 4 enableTrace = False traceSizeInBytes = 8192 @@ -68,8 +67,7 @@ def core_body(): # print(ctx.module.operation.verify()) tensorSize = width * height - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) + tensor_ty = T.memref(tensorSize, T.i8()) @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(inTensor, notUsed, outTensor): @@ -157,13 +155,13 @@ def sequence(inTensor, notUsed, outTensor): metadata="in", bd_id=1, mem=inTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="out", bd_id=0, mem=outTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/vision/vision_passthrough/test.cpp b/programming_examples/vision/vision_passthrough/test.cpp index 
7eaa17f43f..f33fc64c22 100644 --- a/programming_examples/vision/vision_passthrough/test.cpp +++ b/programming_examples/vision/vision_passthrough/test.cpp @@ -95,13 +95,13 @@ int main(int argc, const char *argv[]) { // set up the buffer objects auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, inImageGray.total() * inImageGray.elemSize(), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_inB = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_out = xrt::bo(device, (outImageTest.total() * outImageTest.elemSize()), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -124,7 +124,8 @@ int main(int argc, const char *argv[]) { // Execute the kernel and wait to finish if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); // Sync device to host memories diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/test.cpp b/programming_guide/section-2/section-2e/02_external_mem_to_core/test.cpp index c14ca0f131..8a2e5e0d37 100644 --- a/programming_guide/section-2/section-2e/02_external_mem_to_core/test.cpp +++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/test.cpp @@ -133,13 +133,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = 
xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -158,7 +158,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/test.cpp b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/test.cpp index ba0e2b954d..513c296af4 100644 --- a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/test.cpp +++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/test.cpp @@ -134,13 +134,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if 
(verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -159,7 +159,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_guide/section-2/section-2e/04_distribute_L2/test.cpp b/programming_guide/section-2/section-2e/04_distribute_L2/test.cpp index 19936b2da5..9a854eedd3 100644 --- a/programming_guide/section-2/section-2e/04_distribute_L2/test.cpp +++ b/programming_guide/section-2/section-2e/04_distribute_L2/test.cpp @@ -134,13 +134,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -159,7 +159,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_guide/section-2/section-2e/05_join_L2/test.cpp 
b/programming_guide/section-2/section-2e/05_join_L2/test.cpp index c14ca0f131..8a2e5e0d37 100644 --- a/programming_guide/section-2/section-2e/05_join_L2/test.cpp +++ b/programming_guide/section-2/section-2e/05_join_L2/test.cpp @@ -133,13 +133,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -158,7 +158,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_guide/section-3/test.cpp b/programming_guide/section-3/test.cpp index 0698905f19..8ed5ccf57d 100644 --- a/programming_guide/section-3/test.cpp +++ b/programming_guide/section-3/test.cpp @@ -56,13 +56,13 @@ int main(int argc, const char *argv[]) { // set up the buffer objects auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(DATATYPE), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); 
auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -93,7 +93,9 @@ int main(int argc, const char *argv[]) { // Execute the kernel and wait to finish if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); + unsigned int opcode = 3; + auto run = + kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); run.wait(); // Sync device to host memories diff --git a/programming_guide/section-3/test.py b/programming_guide/section-3/test.py index cb57c1e4c6..b422ccd242 100644 --- a/programming_guide/section-3/test.py +++ b/programming_guide/section-3/test.py @@ -53,10 +53,10 @@ def main(opts): # ------------------------------------------------------ # Initialize input/ output buffer sizes and sync them # ------------------------------------------------------ - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0)) - bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) - bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) - bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4)) + bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(1)) + bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(3)) + bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(4)) + bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(5)) # Initialize instruction buffer bo_instr.write(instr_v, 0) @@ -87,7 +87,8 @@ def main(opts): # Run kernel if opts.verbosity >= 1: print("Running Kernel.") - h = kernel(bo_instr, 
len(instr_v), bo_inout0, bo_inout1, bo_inout2) + opcode = 3 + h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) h.wait() bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) diff --git a/programming_guide/section-4/section-4a/README.md b/programming_guide/section-4/section-4a/README.md index 49d19564e1..4c04e51c06 100644 --- a/programming_guide/section-4/section-4a/README.md +++ b/programming_guide/section-4/section-4a/README.md @@ -24,7 +24,8 @@ Adding the application timer is as simple as noting a start and stop time surrou ```c++ auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); diff --git a/programming_guide/section-4/section-4a/test.cpp b/programming_guide/section-4/section-4a/test.cpp index a5af1576bf..e99f8ae77b 100644 --- a/programming_guide/section-4/section-4a/test.cpp +++ b/programming_guide/section-4/section-4a/test.cpp @@ -59,13 +59,13 @@ int main(int argc, const char *argv[]) { // set up the buffer objects auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(DATATYPE), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -112,7 +112,9 @@ int main(int argc, const char *argv[]) { if (verbosity >= 
1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); + unsigned int opcode = 3; + auto run = + kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); diff --git a/programming_guide/section-4/section-4a/test.py b/programming_guide/section-4/section-4a/test.py index be6cc7dc0b..d4d47cd918 100644 --- a/programming_guide/section-4/section-4a/test.py +++ b/programming_guide/section-4/section-4a/test.py @@ -53,10 +53,10 @@ def main(opts): # ------------------------------------------------------ # Initialize input/ output buffer sizes and sync them # ------------------------------------------------------ - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0)) - bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) - bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) - bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4)) + bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(1)) + bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(3)) + bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(4)) + bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(5)) # Initialize instruction buffer bo_instr.write(instr_v, 0) @@ -92,7 +92,8 @@ def main(opts): if opts.verbosity >= 1: print("Running Kernel.") start = time.time_ns() - h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) + opcode = 3 + h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) h.wait() stop = time.time_ns() bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) diff --git a/programming_guide/section-4/section-4b/test.cpp b/programming_guide/section-4/section-4b/test.cpp index 
4e27fd8780..8d066a2b0e 100644 --- a/programming_guide/section-4/section-4b/test.cpp +++ b/programming_guide/section-4/section-4b/test.cpp @@ -62,13 +62,13 @@ int main(int argc, const char *argv[]) { // set up the buffer objects auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = - xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_outC = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -115,7 +115,9 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); + unsigned int opcode = 3; + auto run = + kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); diff --git a/programming_guide/section-4/section-4b/test.py b/programming_guide/section-4/section-4b/test.py index b8ada47d6c..01330cdc5e 100644 --- a/programming_guide/section-4/section-4b/test.py +++ b/programming_guide/section-4/section-4b/test.py @@ -54,10 +54,10 @@ def main(opts): # ------------------------------------------------------ # Initialize input/ output buffer sizes and sync them # ------------------------------------------------------ - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0)) - bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) - bo_inout1 = 
xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) - bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4)) + bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(1)) + bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(3)) + bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(4)) + bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(5)) # Initialize instruction buffer bo_instr.write(instr_v, 0) @@ -93,7 +93,8 @@ def main(opts): if opts.verbosity >= 1: print("Running Kernel.") start = time.time_ns() - h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) + opcode = 3 + h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) h.wait() stop = time.time_ns() bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py index be72ee3363..468a8a6fcf 100644 --- a/python/compiler/aiecc/main.py +++ b/python/compiler/aiecc/main.py @@ -110,31 +110,43 @@ def emit_design_kernel_json( buffer_args=None, ): if buffer_args is None: - buffer_args = ["in", "tmp", "out"] + buffer_args = [f"bo{i}" for i in range(5)] arguments = [ + { + "name": "opcode", + "address-qualifier": "SCALAR", + "type": "uint64_t", + "offset": "0x00", + }, + ] + offset = 0x08 + + inst_arguments = [ { "name": "instr", "memory-connection": "SRAM", "address-qualifier": "GLOBAL", "type": "char *", - "offset": "0x00", + "offset": str(hex(offset)), }, { "name": "ninstr", "address-qualifier": "SCALAR", - "type": "uint64_t", - "offset": "0x08", + "type": "uint32_t", + "offset": str(hex(offset + 8)), }, ] + arguments.append(inst_arguments[0]) + arguments.append(inst_arguments[1]) + offset += 12 - offset = 0x10 for buf in buffer_args: arg = { "name": buf, "memory-connection": "HOST", "address-qualifier": "GLOBAL", - "type": "char *", + "type": "void*", "offset": str(hex(offset)), } 
arguments.append(arg) @@ -148,7 +160,7 @@ def emit_design_kernel_json( "type": "dpu", "extended-data": { "subtype": "DPU", - "functional": "1", + "functional": "0", "dpu_kernel_id": kernel_id, }, "arguments": arguments, @@ -570,7 +582,7 @@ async def process_xclbin_gen(self): self.prepend_tmp("aie_partition.json"), ) - buffer_arg_names = [f"bo{i}" for i in range(6)] + buffer_arg_names = [f"bo{i}" for i in range(5)] await write_file_async( json.dumps( emit_design_kernel_json( diff --git a/python/utils/README.md b/python/utils/README.md index 211386adaa..b0ab9138dd 100644 --- a/python/utils/README.md +++ b/python/utils/README.md @@ -87,7 +87,7 @@ Test/ Host code utilities. To better appreciate what this wrapper function does, we need to delve more deeply into the details on how trace units are configured. ### Configure tile trace settings -Within the `func.func @sequence` block, we call a set of configuration register writes (`aiex.npu.write32`) to configure the tile trace units and (`aiex.npu.writebd_shimtile`) to configure the shimDMA. +Within the `func.func @sequence` block, we call a set of configuration register writes (`aiex.npu.write32`) to configure the tile trace units and (`aiex.npu.writebd`) to configure the shimDMA. For a give AIE2 tile, we configure the trace control registers for the tile core and tile memory separately. There are 4 registers we generally use to configure the trace unit behavior. 2 are for configuring the general trace control and the other 2 are to specify which events our tile's trace hardware is monitoring. 
@@ -262,46 +262,46 @@ An example ddr_id to inout buffer mapping is below: in C/C++ ```c++ -aiex.npu.writebd_shimtile { bd_id = 3 : i32, - buffer_length = 16384 : i32, - buffer_offset = 262144 : i32, - enable_packet = 0 : i32, - out_of_order_id = 0 : i32, - packet_id = 0 : i32, - packet_type = 0 : i32, - column = 0 : i32, - column_num = 1 : i32, - d0_stepsize = 0 : i32, - d0_size = 0 : i32, - d0_stride = 0 : i32, - d0_wrap = 0 : i32, - d1_stepsize = 0 : i32, - d1_wrap = 0 : i32, - d1_size = 0 : i32, - d1_stride = 0 : i32, - d2_stepsize = 0 : i32, - d2_size = 0 : i32, - d2_stride = 0 : i32, - ddr_id = 2 : i32, - iteration_current = 0 : i32, - iteration_stepsize = 0 : i32, - iteration_wrap = 0 : i32, - iteration_size = 0 : i32, - iteration_stride = 0 : i32, - lock_acq_enable = 0 : i32, - lock_acq_id = 0 : i32, - lock_acq_val = 0 : i32, - lock_rel_id = 0 : i32, - lock_rel_val = 0 : i32, - next_bd = 0 : i32, - use_next_bd = 0 : i32, - valid_bd = 1 : i32} +aiex.npu.writebd { bd_id = 3 : i32, + buffer_length = 16384 : i32, + buffer_offset = 262144 : i32, + enable_packet = 0 : i32, + out_of_order_id = 0 : i32, + packet_id = 0 : i32, + packet_type = 0 : i32, + column = 0 : i32, + d0_stepsize = 0 : i32, + d0_size = 0 : i32, + d0_stride = 0 : i32, + d0_wrap = 0 : i32, + d1_stepsize = 0 : i32, + d1_wrap = 0 : i32, + d1_size = 0 : i32, + d1_stride = 0 : i32, + d2_stepsize = 0 : i32, + d2_size = 0 : i32, + d2_stride = 0 : i32, + ddr_id = 2 : i32, + iteration_current = 0 : i32, + iteration_stepsize = 0 : i32, + iteration_wrap = 0 : i32, + iteration_size = 0 : i32, + iteration_stride = 0 : i32, + lock_acq_enable = 0 : i32, + lock_acq_id = 0 : i32, + lock_acq_val = 0 : i32, + lock_rel_id = 0 : i32, + lock_rel_val = 0 : i32, + next_bd = 0 : i32, + row = 0 : i32, + use_next_bd = 0 : i32, + valid_bd = 1 : i32} // Set start BD to our shim bd_Id (3) aiex.npu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 } ``` in Python ```python 
-npu_writebd_shimtile( +npu_writebd( bd_id=3, buffer_length=16384, buffer_offset=262144, diff --git a/python/utils/trace.py b/python/utils/trace.py index 510d7feae7..743878fae6 100644 --- a/python/utils/trace.py +++ b/python/utils/trace.py @@ -147,7 +147,7 @@ def slave(port): # Configure a buffer descriptor to write tracing information that has been routed into this shim tile # out to host DDR memory - npu_writebd_shimtile( + npu_writebd( bd_id=bd_id, buffer_length=size, buffer_offset=offset, @@ -156,7 +156,6 @@ def slave(port): packet_id=0, packet_type=0, column=int(shim.col), - column_num=1, d0_size=0, d0_stride=0, d1_size=0, @@ -172,6 +171,7 @@ def slave(port): lock_rel_id=0, lock_rel_val=0, next_bd=0, + row=0, use_next_bd=0, valid_bd=1, ) diff --git a/python/utils/xrt.py b/python/utils/xrt.py index c5df0b66ca..7319edd872 100644 --- a/python/utils/xrt.py +++ b/python/utils/xrt.py @@ -36,7 +36,7 @@ def __init__(self, xclbin_path, insts_path, kernel_name="PP_FD_PRE"): insts = read_insts(insts_path) self.n_insts = len(insts) self.insts_buffer = AIE_Buffer( - self, 0, insts.dtype, insts.shape, xrt.bo.cacheable + self, 1, insts.dtype, insts.shape, xrt.bo.cacheable ) self.insts_buffer.write(insts) @@ -46,10 +46,14 @@ def register_buffer(self, group_id, *args, **kwargs): def run(self): self.insts_buffer.sync_to_device() h = self.call() - h.wait() + r = h.wait() + if r != xrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED: + raise Exception(f"Kernel returned {r}") def call(self): + opcode = 3 h = self.kernel( + opcode, self.insts_buffer.bo, self.n_insts * 4, *[b.bo for b in self.buffers if b is not None], @@ -129,13 +133,13 @@ def setup_aie( trace_size=16384, ): app = AIE_Application(xclbin_path, insts_path, kernel_name) - app.register_buffer(2, shape=in_0_shape, dtype=in_0_dtype) - app.register_buffer(3, shape=in_1_shape, dtype=in_1_dtype) + app.register_buffer(3, shape=in_0_shape, dtype=in_0_dtype) + app.register_buffer(4, shape=in_1_shape, dtype=in_1_dtype) if enable_trace: 
out_buf_len_bytes = np.prod(out_buf_shape) * np.dtype(out_buf_dtype).itemsize out_buf_shape = (out_buf_len_bytes + trace_size,) out_buf_dtype = np.uint8 - app.register_buffer(4, shape=out_buf_shape, dtype=out_buf_dtype) + app.register_buffer(5, shape=out_buf_shape, dtype=out_buf_dtype) return app @@ -156,7 +160,7 @@ def write_out_trace(trace, file_name): def execute(app, ifm_mem_fmt, total_wts): - app.buffers[2].write(ifm_mem_fmt) # input's standard format CYX | scalar YCX - app.buffers[3].write(total_wts) # wts's standard format OIYX | scalar OIYX + app.buffers[3].write(ifm_mem_fmt) # input's standard format CYX | scalar YCX + app.buffers[4].write(total_wts) # wts's standard format OIYX | scalar OIYX app.run() - return app.buffers[4].read() + return app.buffers[5].read() diff --git a/test/Conversion/AIEVecToLLVM/test-max.mlir b/test/Conversion/AIEVecToLLVM/test-max.mlir index 59d4f49bcd..033ec75e71 100644 --- a/test/Conversion/AIEVecToLLVM/test-max.mlir +++ b/test/Conversion/AIEVecToLLVM/test-max.mlir @@ -7,7 +7,7 @@ func.func @i8_max(%arg0 : vector<64xi8>) -> vector<64xi8> { // CHECK-LABEL: @i8_max // CHECK-SAME: %[[ARG0:.*]]: vector<64xi8> -// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[VMAX:.*]] = "xllvm.intr.aie2.vmax.lt8"( // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : // CHECK-SAME: (vector<64xi8>, vector<64xi8>, i32) -> !llvm.struct<(vector<64xi8>, vector<2xi32>)> @@ -23,7 +23,7 @@ func.func @i16_max(%arg0 : vector<32xi16>) -> vector<32xi16> { // CHECK-LABEL: @i16_max // CHECK-SAME: %[[ARG0:.*]]: vector<32xi16> -// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[VMAX:.*]] = "xllvm.intr.aie2.vmax.lt16"( // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : // CHECK-SAME: (vector<32xi16>, vector<32xi16>, i32) -> !llvm.struct<(vector<32xi16>, i32)> @@ -39,7 +39,7 @@ func.func @i32_max(%arg0 : 
vector<16xi32>) -> vector<16xi32> { // CHECK-LABEL: @i32_max // CHECK-SAME: %[[ARG0:.*]]: vector<16xi32> -// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[VMAX:.*]] = "xllvm.intr.aie2.vmax.lt32"( // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : // CHECK-SAME: (vector<16xi32>, vector<16xi32>, i32) -> !llvm.struct<(vector<16xi32>, i32)> diff --git a/test/Conversion/AIEVecToLLVM/test-min.mlir b/test/Conversion/AIEVecToLLVM/test-min.mlir index 4930d639c9..595d759438 100644 --- a/test/Conversion/AIEVecToLLVM/test-min.mlir +++ b/test/Conversion/AIEVecToLLVM/test-min.mlir @@ -7,7 +7,7 @@ func.func @i8_min(%arg0 : vector<64xi8>) -> vector<64xi8> { // CHECK-LABEL: @i8_min // CHECK-SAME: %[[ARG0:.*]]: vector<64xi8> -// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[VMIN:.*]] = "xllvm.intr.aie2.vmin.ge8"( // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : // CHECK-SAME: (vector<64xi8>, vector<64xi8>, i32) -> !llvm.struct<(vector<64xi8>, vector<2xi32>)> @@ -23,7 +23,7 @@ func.func @i16_min(%arg0 : vector<32xi16>) -> vector<32xi16> { // CHECK-LABEL: @i16_min // CHECK-SAME: %[[ARG0:.*]]: vector<32xi16> -// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[VMIN:.*]] = "xllvm.intr.aie2.vmin.ge16"( // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : // CHECK-SAME: (vector<32xi16>, vector<32xi16>, i32) -> !llvm.struct<(vector<32xi16>, i32)> @@ -39,7 +39,7 @@ func.func @i32_min(%arg0 : vector<16xi32>) -> vector<16xi32> { // CHECK-LABEL: @i32_min // CHECK-SAME: %[[ARG0:.*]]: vector<16xi32> -// CHECK: %[[CST:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[VMIN:.*]] = "xllvm.intr.aie2.vmin.ge32"( // CHECK-SAME: %[[ARG0]], %[[ARG0]], %[[CST]]) : // CHECK-SAME: 
(vector<16xi32>, vector<16xi32>, i32) -> !llvm.struct<(vector<16xi32>, i32)> diff --git a/test/Conversion/DmaToNpu/aiert_insts.mlir b/test/Conversion/DmaToNpu/aiert_insts.mlir index fe2d61a08a..82df23f5e2 100644 --- a/test/Conversion/DmaToNpu/aiert_insts.mlir +++ b/test/Conversion/DmaToNpu/aiert_insts.mlir @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// // RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s -// CHECK: aiex.npu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 32 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 32 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483649 : ui32} -// CHECK: aiex.npu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, 
d1_size = 2 : i32, d1_stride = 7 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 7 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32} module { diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir new file mode 100644 index 0000000000..6f0ed03057 --- /dev/null +++ b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir @@ -0,0 +1,29 @@ +//===- bad_dma_to_npu.mlir --------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +// Date: July 3rd 2023 +// +//===----------------------------------------------------------------------===// + +// RUN: not aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s + +// CHECK: error: 'aiex.npu.dma_memcpy_nd' op Minimum data transfer size required is 32bits. + + +module @shimDmaMemcpy{ + aie.device(xcve2302) { + memref.global "public" @toMem : memref<1xbf16> + func.func @sequence(%arg0: memref<1xbf16>, %arg1: memref<1xbf16>, %arg2: memref<1xbf16>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256]) {id = 0 : i64, metadata = @toMem} : memref<1xbf16> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + aie.shim_dma_allocation @toMem (S2MM, 0, 0) + } +} + diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir new file mode 100644 index 0000000000..bb4af49938 --- /dev/null +++ b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir @@ -0,0 +1,29 @@ +//===- bad_dma_to_npu_datatype.mlir --------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +// Date: July 3rd 2023 +// +//===----------------------------------------------------------------------===// + +// RUN: not aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s + +// CHECK: error: 'aiex.npu.dma_memcpy_nd' op Maximum element bit width allowed is 32bits. 
+ + +module @shimDmaMemcpy{ + aie.device(xcve2302) { + memref.global "public" @toMem : memref<65536xi64> + func.func @sequence(%arg0: memref<65536xi64>, %arg1: memref<65536xi64>, %arg2: memref<65536xi64>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256]) {id = 0 : i64, metadata = @toMem} : memref<65536xi64> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + aie.shim_dma_allocation @toMem (S2MM, 0, 0) + } +} + diff --git a/test/Conversion/DmaToNpu/dma_to_npu.mlir b/test/Conversion/DmaToNpu/dma_to_npu.mlir index 151f1351ec..f622375887 100644 --- a/test/Conversion/DmaToNpu/dma_to_npu.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu.mlir @@ -12,10 +12,10 @@ // TODO - more // CHECK-LABEL: dma_memcpy_nd_0 -// CHECK: aiex.npu.writebd_shimtile +// CHECK: aiex.npu.writebd // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 -// CHECK: aiex.npu.writebd_shimtile +// CHECK: aiex.npu.writebd // CHECK-SAME: ddr_id = 1 : i32 module { aie.device(npu1_4col) { @@ -34,9 +34,10 @@ module { // ----- // CHECK-LABEL: dma_wait_s2mm -// CHECK: aiex.npu.writebd_shimtile +// CHECK: aiex.npu.writebd // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 +// CHECK: aiex.npu.address_patch // CHECK: aiex.npu.write32 // CHECK: aiex.npu.sync // CHECK-SAME: channel = 0 : i32 @@ -60,7 +61,7 @@ module { // ----- // CHECK-LABEL: dma_wait_mm2s -// CHECK: aiex.npu.writebd_shimtile +// CHECK: aiex.npu.writebd // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 // CHECK: aiex.npu.write32 diff --git a/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir index b1f80ee241..6f35cd04fe 100644 --- a/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir @@ -12,12 +12,12 @@ // TODO - more // CHECK-LABEL: test1 -// CHECK: aiex.npu.writebd_shimtile 
+// CHECK: aiex.npu.writebd // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 // CHECK: aiex.npu.write32 // CHECK-SAME: value = 2147483649 -// CHECK: aiex.npu.writebd_shimtile +// CHECK: aiex.npu.writebd // CHECK-SAME: ddr_id = 1 : i32 // CHECK: aiex.npu.write32 // CHECK-SAME: value = 0 diff --git a/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir new file mode 100644 index 0000000000..af75cd8b33 --- /dev/null +++ b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir @@ -0,0 +1,40 @@ +//===- dma_to_npu_width_conversion.mlir --------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +// Date: July 3rd 2023 +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s + +//CHECK-LABEL: aie.device(xcve2302) { +//CHECK: memref.global "public" @toMem : memref<65536xbf16> +//CHECK: func.func @sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) { +//CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 8192 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 32 : i32, d0_stride = 0 : i32, d1_size = 64 : i32, d1_stride = 127 : i32, d2_stride = 31 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +//CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : 
i32, arg_plus = 0 : i32} +//CHECK: aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147680256 : ui32} +//CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +//CHECK: return +//CHECK: } +//CHECK: aie.shim_dma_allocation @toMem(S2MM, 0, 0) +//CHECK: } + + + +module @shimDmaMemcpy{ + aie.device(xcve2302) { + memref.global "public" @toMem : memref<65536xbf16> + func.func @sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256]) {id = 0 : i64, metadata = @toMem} : memref<65536xbf16> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + aie.shim_dma_allocation @toMem (S2MM, 0, 0) + } +} + diff --git a/test/Conversion/DmaToNpu/push_to_queue.mlir b/test/Conversion/DmaToNpu/push_to_queue.mlir index c9b4361ca2..4f9adad9d7 100644 --- a/test/Conversion/DmaToNpu/push_to_queue.mlir +++ b/test/Conversion/DmaToNpu/push_to_queue.mlir @@ -12,14 +12,10 @@ module { aie.device(npu1_4col) { - memref.global "public" @toMem : memref<32xi32> - memref.global "public" @fromMem : memref<32xi32> func.func @sequence() { - aiex.npu.shimtile_push_queue {metadata = @toMem, issue_token = true, repeat_count = 0 : i32, bd_id = 3 : i32 } - aiex.npu.shimtile_push_queue {metadata = @fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 2 : i32 } + aiex.npu.push_queue (0, 0, S2MM:1) {issue_token = true, repeat_count = 0 : i32, bd_id = 3 : i32 } + aiex.npu.push_queue (2, 0, MM2S:0) {issue_token = false, repeat_count = 3 : i32, bd_id = 2 : i32 } return } - aie.shim_dma_allocation @fromMem (MM2S, 0, 2) - aie.shim_dma_allocation @toMem (S2MM, 1, 0) } } diff --git a/test/Targets/NPU/npu_instgen.mlir b/test/Targets/NPU/npu_instgen.mlir index 299311bfae..847a51dbaf 100644 --- 
a/test/Targets/NPU/npu_instgen.mlir +++ b/test/Targets/NPU/npu_instgen.mlir @@ -13,24 +13,11 @@ module { aie.device(npu1_4col) { func.func @test0(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { - // look for the prolog. - // CHECK: 00000011 - // CHECK-NEXT: 01000405 - // CHECK-NEXT: 01000100 - // CHECK-NEXT: 0B590100 - // CHECK-NEXT: 000055FF - // CHECK-NEXT: 00000001 - // CHECK-NEXT: 00000010 - // CHECK-NEXT: 314E5A5F - // CHECK-NEXT: 635F5F31 - // CHECK-NEXT: 676E696C - // CHECK-NEXT: 39354E5F - // CHECK-NEXT: 6E693131 - // CHECK-NEXT: 5F727473 - // CHECK-NEXT: 64726F77 - // CHECK-NEXT: 00004573 - // CHECK-NEXT: 07BD9630 - // CHECK-NEXT: 000055FF + // TXN header + // CHECK: 06030100 + // CHECK: 00000105 + // CHECK: 00000003 + // CHECK: 00000068 %c16_i64 = arith.constant 16 : i64 %c1_i64 = arith.constant 1 : i64 @@ -38,8 +25,10 @@ module { %c64_i64 = arith.constant 64 : i64 %c0_i32 = arith.constant 0 : i32 %c1_i32 = arith.constant 1 : i32 - // CHECK: 060304A6 + // CHECK: 00000001 // CHECK: 00000000 + // CHECK: 0601D0C0 + // CHECK: 00000030 // CHECK: 00000001 // CHECK: 00000002 // CHECK: 00000000 @@ -48,37 +37,40 @@ module { // CHECK: 00000009 // CHECK: 2CD0000C // CHECK: 2E107041 - aiex.npu.writebd_shimtile { bd_id = 6 : i32, - buffer_length = 1 : i32, - buffer_offset = 2 : i32, - enable_packet = 0 : i32, - out_of_order_id = 0 : i32, - packet_id = 0 : i32, - packet_type = 0 : i32, - column = 3 : i32, - column_num = 4 : i32, - d0_stride = 5 : i32, - d0_size = 6 : i32, - d1_stride = 7 : i32, - d1_size = 8 : i32, - d2_stride = 9 : i32, - ddr_id = 10 : i32, - iteration_current = 11 : i32, - iteration_stride = 12 : i32, - iteration_size = 13 : i32, - lock_acq_enable = 1 : i32, - lock_acq_id = 1 : i32, - lock_acq_val = 2 : i32, - lock_rel_id = 3 : i32, - lock_rel_val = 4 : i32, - next_bd = 5 : i32, - use_next_bd = 1 : i32, - valid_bd = 1 : i32} - // CHECK: 02030400 - // CHECK: ABC00DEF + aiex.npu.writebd { bd_id = 6 : i32, + buffer_length = 1 : i32, + buffer_offset = 2 
: i32, + enable_packet = 0 : i32, + out_of_order_id = 0 : i32, + packet_id = 0 : i32, + packet_type = 0 : i32, + column = 3 : i32, + row = 0 : i32, + d0_stride = 5 : i32, + d0_size = 6 : i32, + d1_stride = 7 : i32, + d1_size = 8 : i32, + d2_stride = 9 : i32, + ddr_id = 10 : i32, + iteration_current = 11 : i32, + iteration_stride = 12 : i32, + iteration_size = 13 : i32, + lock_acq_enable = 1 : i32, + lock_acq_id = 1 : i32, + lock_acq_val = 2 : i32, + lock_rel_id = 3 : i32, + lock_rel_val = 4 : i32, + next_bd = 5 : i32, + use_next_bd = 1 : i32, + valid_bd = 1 : i32} + // CHECK: 00000000 + // CHECK: 00000000 + // CHECK: 06400DEF + // CHECK: 00000000 // CHECK: 00000042 aiex.npu.write32 { column = 3 : i32, row = 4 : i32, address = 0xabc00def : ui32, value = 0x42 : ui32 } - // CHECK: 03030401 + + // CHECK: 00030401 // CHECK: 05010200 aiex.npu.sync { column = 3 : i32, row = 4 : i32, direction = 1 : i32, channel = 5 : i32, column_num = 1 : i32, row_num = 2 : i32 } return diff --git a/test/aie2xclbin/buffers_xclbin.mlir b/test/aie2xclbin/buffers_xclbin.mlir index 26709109a9..88d6ba9d4f 100644 --- a/test/aie2xclbin/buffers_xclbin.mlir +++ b/test/aie2xclbin/buffers_xclbin.mlir @@ -19,64 +19,63 @@ // CHECK: { // CHECK: "arguments": [ // CHECK: { +// CHECK: "address-qualifier": "SCALAR", +// CHECK: "name": "opcode", +// CHECK: "offset": "0x00", +// CHECK: "type": "uint64_t" +// CHECK: }, +// CHECK: { // CHECK: "address-qualifier": "GLOBAL", // CHECK: "memory-connection": "SRAM", // CHECK: "name": "instr", -// CHECK: "offset": "0x00", +// CHECK: "offset": "0x08", // CHECK: "type": "char *" // CHECK: }, // CHECK: { // CHECK: "address-qualifier": "SCALAR", // CHECK: "name": "ninstr", -// CHECK: "offset": "0x08", -// CHECK: "type": "uint64_t" +// CHECK: "offset": "0x10", +// CHECK: "type": "uint32_t" // CHECK: }, // CHECK: { // CHECK: "address-qualifier": "GLOBAL", // CHECK: "memory-connection": "HOST", // CHECK: "name": "bo0", -// CHECK: "offset": "0x10", -// CHECK: "type": "char 
*" +// CHECK: "offset": "0x14", +// CHECK: "type": "void*" // CHECK: }, // CHECK: { // CHECK: "address-qualifier": "GLOBAL", // CHECK: "memory-connection": "HOST", // CHECK: "name": "bo1", -// CHECK: "offset": "0x18", -// CHECK: "type": "char *" +// CHECK: "offset": "0x1c", +// CHECK: "type": "void*" // CHECK: }, // CHECK: { // CHECK: "address-qualifier": "GLOBAL", // CHECK: "memory-connection": "HOST", // CHECK: "name": "bo2", -// CHECK: "offset": "0x20", -// CHECK: "type": "char *" +// CHECK: "offset": "0x24", +// CHECK: "type": "void*" // CHECK: }, // CHECK: { // CHECK: "address-qualifier": "GLOBAL", // CHECK: "memory-connection": "HOST", // CHECK: "name": "bo3", -// CHECK: "offset": "0x28", -// CHECK: "type": "char *" +// CHECK: "offset": "0x2c", +// CHECK: "type": "void*" // CHECK: }, // CHECK: { // CHECK: "address-qualifier": "GLOBAL", // CHECK: "memory-connection": "HOST", // CHECK: "name": "bo4", -// CHECK: "offset": "0x30", -// CHECK: "type": "char *" -// CHECK: }, -// CHECK: { -// CHECK: "address-qualifier": "GLOBAL", -// CHECK: "memory-connection": "HOST", -// CHECK: "name": "bo5", -// CHECK: "offset": "0x38", -// CHECK: "type": "char *" +// CHECK: "offset": "0x34", +// CHECK: "type": "void*" // CHECK: } // CHECK: ], // CHECK: "extended-data": { // CHECK: "dpu_kernel_id": "0x901", -// CHECK: "functional": "1", +// CHECK: "functional": "0", // CHECK: "subtype": "DPU" // CHECK: }, // CHECK: "instances": [ diff --git a/test/aiecc/buffers_xclbin.mlir b/test/aiecc/buffers_xclbin.mlir index 096c3ada31..a2d0fd380e 100644 --- a/test/aiecc/buffers_xclbin.mlir +++ b/test/aiecc/buffers_xclbin.mlir @@ -20,64 +20,63 @@ // CHECK: "type": "dpu", // CHECK: "extended-data": { // CHECK: "subtype": "DPU", -// CHECK: "functional": "1", +// CHECK: "functional": "0", // CHECK: "dpu_kernel_id": "0x901" // CHECK: }, // CHECK: "arguments": [ // CHECK: { +// CHECK: "name": "opcode", +// CHECK: "address-qualifier": "SCALAR", +// CHECK: "type": "uint64_t", +// CHECK: "offset": 
"0x00" +// CHECK: }, +// CHECK: { // CHECK: "name": "instr", // CHECK: "memory-connection": "SRAM", // CHECK: "address-qualifier": "GLOBAL", // CHECK: "type": "char *", -// CHECK: "offset": "0x00" +// CHECK: "offset": "0x8" // CHECK: }, // CHECK: { // CHECK: "name": "ninstr", // CHECK: "address-qualifier": "SCALAR", -// CHECK: "type": "uint64_t", -// CHECK: "offset": "0x08" +// CHECK: "type": "uint32_t", +// CHECK: "offset": "0x10" // CHECK: }, // CHECK: { // CHECK: "name": "bo0", // CHECK: "memory-connection": "HOST", // CHECK: "address-qualifier": "GLOBAL", -// CHECK: "type": "char *", -// CHECK: "offset": "0x10" +// CHECK: "type": "void*", +// CHECK: "offset": "0x14" // CHECK: }, // CHECK: { // CHECK: "name": "bo1", // CHECK: "memory-connection": "HOST", // CHECK: "address-qualifier": "GLOBAL", -// CHECK: "type": "char *", -// CHECK: "offset": "0x18" +// CHECK: "type": "void*", +// CHECK: "offset": "0x1c" // CHECK: }, // CHECK: { // CHECK: "name": "bo2", // CHECK: "memory-connection": "HOST", // CHECK: "address-qualifier": "GLOBAL", -// CHECK: "type": "char *", -// CHECK: "offset": "0x20" +// CHECK: "type": "void*", +// CHECK: "offset": "0x24" // CHECK: }, // CHECK: { // CHECK: "name": "bo3", // CHECK: "memory-connection": "HOST", // CHECK: "address-qualifier": "GLOBAL", -// CHECK: "type": "char *", -// CHECK: "offset": "0x28" +// CHECK: "type": "void*", +// CHECK: "offset": "0x2c" // CHECK: }, // CHECK: { // CHECK: "name": "bo4", // CHECK: "memory-connection": "HOST", // CHECK: "address-qualifier": "GLOBAL", -// CHECK: "type": "char *", -// CHECK: "offset": "0x30" -// CHECK: }, -// CHECK: { -// CHECK: "name": "bo5", -// CHECK: "memory-connection": "HOST", -// CHECK: "address-qualifier": "GLOBAL", -// CHECK: "type": "char *", -// CHECK: "offset": "0x38" +// CHECK: "type": "void*", +// CHECK: "offset": "0x34" // CHECK: } // CHECK: ], // CHECK: "instances": [ diff --git a/test/dialect/AIEX/bad_npu_nd.mlir b/test/dialect/AIEX/bad_npu_nd.mlir index 
7881d80cfe..c89587b3f1 100644 --- a/test/dialect/AIEX/bad_npu_nd.mlir +++ b/test/dialect/AIEX/bad_npu_nd.mlir @@ -66,17 +66,3 @@ module { // ----- -module { - aie.device(npu1_4col) { - func.func @bad_npu_nd_type(%in : memref<1920x1080xi8>, %buf : memref<32xi32>, %out : memref<1920x1080xi8>) { - %c0 = arith.constant 0 : i64 - %c1 = arith.constant 1 : i64 - %c1920 = arith.constant 1920 : i64 - %c1080 = arith.constant 1080 : i64 - // expected-error@+1 {{must be used with memref type with element width 32.}} - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi8> - return - } - aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) - } -} diff --git a/test/dialect/AIEX/bad_npu_push_queue.mlir b/test/dialect/AIEX/bad_npu_push_queue.mlir index 28d194d4ae..c6e66d37aa 100644 --- a/test/dialect/AIEX/bad_npu_push_queue.mlir +++ b/test/dialect/AIEX/bad_npu_push_queue.mlir @@ -15,10 +15,9 @@ module { aie.device(npu1_4col) { func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.npu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 28 : i32 } + aiex.npu.push_queue (0, 0, MM2S:0) {issue_token = false, repeat_count = 3 : i32, bd_id = 28 : i32 } return } - aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } } @@ -28,9 +27,8 @@ module { aie.device(npu1_4col) { func.func @bad_repeat_count(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Repeat count exceeds the [0:255] range.}} - aiex.npu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 384 : i32, bd_id = 8 : i32 } + aiex.npu.push_queue (0, 0, MM2S:0) {issue_token = false, repeat_count = 384 : i32, bd_id = 8 : i32 } return } - aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } } \ No newline at end of 
file diff --git a/test/dialect/AIEX/bad_npu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir index a02a5787fc..37b0ada3f5 100644 --- a/test/dialect/AIEX/bad_npu_write_bd.mlir +++ b/test/dialect/AIEX/bad_npu_write_bd.mlir @@ -15,10 +15,9 @@ module { aie.device(npu1_4col) { func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.npu.writebd_shimtile {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } - aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } } @@ -28,10 +27,9 @@ module { aie.device(npu1_4col) { func.func @bad_iteration_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}} - aiex.npu.writebd_shimtile {bd_id = 7 : i32, 
buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } - aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } } @@ -41,10 +39,9 @@ module { aie.device(npu1_4col) { func.func @bad_stride(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}} - aiex.npu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id 
= 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } - aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } } @@ -54,9 +51,8 @@ module { aie.device(npu1_4col) { func.func @bad_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}} - aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, 
lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } - aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } } \ No newline at end of file diff --git a/test/dialect/AIEX/roundtrip.mlir b/test/dialect/AIEX/roundtrip.mlir index 2fcb94fa38..bbab413598 100644 --- a/test/dialect/AIEX/roundtrip.mlir +++ b/test/dialect/AIEX/roundtrip.mlir @@ -28,3 +28,12 @@ func.func @npu_dma_wait_no_device() { aiex.npu.dma_wait {symbol = @out0} return } + +// ----- + +// CHECK-LABEL: func.func @npu_addr_patch +// CHECK: aiex.npu.address_patch {addr = 123 : ui32, arg_idx = 3 : i32, arg_plus = 0 : i32} +func.func @npu_addr_patch() { + aiex.npu.address_patch {addr = 123 : ui32, arg_idx = 3 : i32, arg_plus = 0 : i32} + return +} \ No newline at end of file diff --git a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp index f473e20b44..5c2048546a 100644 --- a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp +++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp @@ -69,13 +69,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); uint32_t *bufInA = 
bo_inA.map(); std::vector srcVecA; @@ -89,7 +89,8 @@ int main(int argc, const char *argv[]) { bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/add_314_using_dma_op/test.cpp b/test/npu-xrt/add_314_using_dma_op/test.cpp index b186b47056..f98179be32 100644 --- a/test/npu-xrt/add_314_using_dma_op/test.cpp +++ b/test/npu-xrt/add_314_using_dma_op/test.cpp @@ -69,13 +69,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); uint32_t *bufInA = bo_inA.map(); std::vector srcVecA; @@ -89,7 +89,8 @@ int main(int argc, const char *argv[]) { bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/add_one_objFifo/test.cpp b/test/npu-xrt/add_one_objFifo/test.cpp index a48ce210ed..a3e2795480 100644 --- a/test/npu-xrt/add_one_objFifo/test.cpp +++ 
b/test/npu-xrt/add_one_objFifo/test.cpp @@ -133,13 +133,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -158,7 +158,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/add_one_two/test.cpp b/test/npu-xrt/add_one_two/test.cpp index 0f2a5c3c7e..6a55930539 100644 --- a/test/npu-xrt/add_one_two/test.cpp +++ b/test/npu-xrt/add_one_two/test.cpp @@ -136,24 +136,24 @@ int main(int argc, const char *argv[]) { auto kernel0 = xrt::kernel(context, kernelName0); auto bo0_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel0.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel0.group_id(1)); auto bo0_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(2)); - auto bo0_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(3)); - auto bo0_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo0_inB = xrt::bo(device, IN_SIZE * 
sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(4)); + auto bo0_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(5)); auto kernel1 = xrt::kernel(context, kernelName1); auto bo1_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel1.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel1.group_id(1)); auto bo1_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(2)); - auto bo1_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(3)); - auto bo1_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo1_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(4)); + auto bo1_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -173,7 +173,9 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel 0.\n"; - auto run0 = kernel0(bo0_instr, instr_v.size(), bo0_inA, bo0_inB, bo0_out); + unsigned int opcode = 3; + auto run0 = + kernel0(opcode, bo0_instr, instr_v.size(), bo0_inA, bo0_inB, bo0_out); run0.wait(); bo0_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); @@ -191,7 +193,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel 1.\n"; - auto run1 = kernel1(bo1_instr, instr_v.size(), bo1_inA, bo1_inB, bo1_out); + auto run1 = + kernel1(opcode, bo1_instr, instr_v.size(), bo1_inA, bo1_inB, bo1_out); run1.wait(); bo1_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/add_one_using_dma/test.cpp b/test/npu-xrt/add_one_using_dma/test.cpp index a48ce210ed..a3e2795480 100644 --- a/test/npu-xrt/add_one_using_dma/test.cpp +++ b/test/npu-xrt/add_one_using_dma/test.cpp @@ -133,13 +133,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = 
xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -158,7 +158,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/cascade_flows/test.cpp b/test/npu-xrt/cascade_flows/test.cpp index 5f7567e3d0..db5cb25086 100644 --- a/test/npu-xrt/cascade_flows/test.cpp +++ b/test/npu-xrt/cascade_flows/test.cpp @@ -133,13 +133,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if 
(verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -158,7 +158,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir index 3112c0c05e..3746978016 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir @@ -577,7 +577,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 
0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -587,7 +587,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, 
iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -597,7 +597,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, 
lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -607,7 +607,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 6735 : ui32} // events:0x00 00 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd 
= 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -617,7 +617,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 - aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} 
aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 11 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -627,7 +627,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 - aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, 
valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir index fb58fa0fb0..7618d49f30 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir @@ -425,7 +425,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride 
= 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -435,7 +435,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, 
lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -445,7 +445,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : 
i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -455,7 +455,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} 
aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -465,7 +465,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 - aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 
119308 : ui32, column = 1 : i32, row = 0 : i32, value = 11 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -475,7 +475,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 - aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} 
aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir index e6d4d7df97..79660c47f5 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir @@ -171,7 +171,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 
0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -181,7 +181,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 - aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, 
lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir index 6ffc1cfda2..75a856a2c8 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir @@ -451,7 +451,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, row = 
0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -461,7 +461,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, 
d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -471,7 +471,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : 
i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -481,7 +481,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, 
lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 2 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -491,7 +491,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 2 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) aiex.npu.write32 {address = 724736 : ui32, column = 2 : i32, row = 1 : i32, value = 589439264 : ui32} // [29:24] port3 S2MM-3, [21:16] port2 S2MM-2, [13:8] port1 S2MM-1, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 2 : i32, row = 1 : i32, value = 0 : ui32} // [5:0] port4 MM2S-0 - aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 6: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, 
lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 6: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 11 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -501,7 +501,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 - aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, 
lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 10 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -511,7 +511,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 - aiex.npu.writebd_shimtile {bd_id = 9 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 9 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, 
lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 9 : ui32} aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp b/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp index 3b26e0623a..70090933ef 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp +++ b/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp @@ -107,13 +107,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_a = - xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_b = - xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_c = xrt::bo(device, C_SIZE + trace_size, XRT_BO_FLAGS_HOST_ONLY, - kernel.group_id(4)); + kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -157,7 +157,8 @@ int main(int argc, const char *argv[]) { std::cout << "Running Kernel.\n"; } auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/matrix_multiplication_using_dma/test.cpp 
b/test/npu-xrt/matrix_multiplication_using_dma/test.cpp index e3dd21b83e..1316c5d4d1 100644 --- a/test/npu-xrt/matrix_multiplication_using_dma/test.cpp +++ b/test/npu-xrt/matrix_multiplication_using_dma/test.cpp @@ -166,13 +166,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_a = - xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_b = - xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_c = - xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -203,7 +203,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c); run.wait(); bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/two_col/test.cpp b/test/npu-xrt/two_col/test.cpp index 25e4da66af..ef26e7960b 100644 --- a/test/npu-xrt/two_col/test.cpp +++ b/test/npu-xrt/two_col/test.cpp @@ -136,13 +136,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_in = - xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto debug = xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto debug = + xrt::bo(device, 
IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_out = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -160,7 +160,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_in, debug, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in, debug, bo_out); run.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/vector_scalar_using_dma/test.cpp b/test/npu-xrt/vector_scalar_using_dma/test.cpp index 79d90f426f..3b055931fa 100644 --- a/test/npu-xrt/vector_scalar_using_dma/test.cpp +++ b/test/npu-xrt/vector_scalar_using_dma/test.cpp @@ -134,13 +134,13 @@ int main(int argc, const char *argv[]) { auto kernel = xrt::kernel(context, kernelName); auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -159,7 +159,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); run.wait(); 
bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/objectFifo-stateful-transform/loop_test.aie.mlir b/test/objectFifo-stateful-transform/loop_test.aie.mlir index 5776be595a..b297b6290b 100644 --- a/test/objectFifo-stateful-transform/loop_test.aie.mlir +++ b/test/objectFifo-stateful-transform/loop_test.aie.mlir @@ -1,4 +1,4 @@ -//===- loop_test.aie.mlir --------------------------*- MLIR -*-===// +//===- loop_test.aie.mlir --------------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -13,118 +13,118 @@ // RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s // CHECK-LABEL: aie.device(xcvc1902) { -// CHECK: memref.global "public" @loop_of : memref<16xi32> -// CHECK: %[[VAL_0:.*]] = aie.tile(1, 2) -// CHECK: %[[VAL_1:.*]] = aie.tile(1, 3) -// CHECK: %[[VAL_2:.*]] = aie.buffer(%[[VAL_0]]) {sym_name = "loop_of_buff_0"} : memref<16xi32> -// CHECK: %[[VAL_3:.*]] = aie.buffer(%[[VAL_0]]) {sym_name = "loop_of_buff_1"} : memref<16xi32> -// CHECK: %[[VAL_4:.*]] = aie.buffer(%[[VAL_0]]) {sym_name = "loop_of_buff_2"} : memref<16xi32> -// CHECK: %[[VAL_5:.*]] = aie.buffer(%[[VAL_0]]) {sym_name = "loop_of_buff_3"} : memref<16xi32> -// CHECK: %[[VAL_6:.*]] = aie.lock(%[[VAL_0]], 0) {init = 0 : i32, sym_name = "loop_of_lock_0"} -// CHECK: %[[VAL_7:.*]] = aie.lock(%[[VAL_0]], 1) {init = 0 : i32, sym_name = "loop_of_lock_1"} -// CHECK: %[[VAL_8:.*]] = aie.lock(%[[VAL_0]], 2) {init = 0 : i32, sym_name = "loop_of_lock_2"} -// CHECK: %[[VAL_9:.*]] = aie.lock(%[[VAL_0]], 3) {init = 0 : i32, sym_name = "loop_of_lock_3"} +// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) +// CHECK-DAG: %[[BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "loop_of_buff_0"} : memref<16xi32> +// CHECK-DAG: %[[BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "loop_of_buff_1"} : memref<16xi32> +// CHECK-DAG: %[[BUFF_2:.*]] = aie.buffer(%[[TILE_1_2]]) 
{sym_name = "loop_of_buff_2"} : memref<16xi32> +// CHECK-DAG: %[[BUFF_3:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "loop_of_buff_3"} : memref<16xi32> +// CHECK-DAG: %[[LOCK_0:.*]] = aie.lock(%[[TILE_1_2]], 0) {init = 0 : i32, sym_name = "loop_of_lock_0"} +// CHECK-DAG: %[[LOCK_1:.*]] = aie.lock(%[[TILE_1_2]], 1) {init = 0 : i32, sym_name = "loop_of_lock_1"} +// CHECK-DAG: %[[LOCK_2:.*]] = aie.lock(%[[TILE_1_2]], 2) {init = 0 : i32, sym_name = "loop_of_lock_2"} +// CHECK-DAG: %[[LOCK_3:.*]] = aie.lock(%[[TILE_1_2]], 3) {init = 0 : i32, sym_name = "loop_of_lock_3"} // CHECK: func.func @some_work(%[[VAL_10:.*]]: memref<16xi32>, %[[VAL_11:.*]]: index) { // CHECK: return // CHECK: } -// CHECK: %[[VAL_12:.*]] = aie.core(%[[VAL_0]]) { +// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { // CHECK: %[[VAL_13:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_14:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_15:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_16:.*]] = arith.constant 4 : index -// CHECK: %[[VAL_17:.*]] = arith.constant 21 : index -// CHECK: aie.use_lock(%[[VAL_6]], Acquire, 0) -// CHECK: func.call @some_work(%[[VAL_2]], %[[VAL_13]]) : (memref<16xi32>, index) -> () -// CHECK: aie.use_lock(%[[VAL_6]], Release, 1) -// CHECK: %[[VAL_18:.*]] = arith.constant 16 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %[[C21:.*]] = arith.constant 21 : index +// CHECK: aie.use_lock(%[[LOCK_0]], Acquire, 0) +// CHECK: func.call @some_work(%[[BUFF_0]], %[[VAL_13]]) : (memref<16xi32>, index) -> () +// CHECK: aie.use_lock(%[[LOCK_0]], Release, 1) +// CHECK: %[[C17:.*]] = arith.constant 17 : index // CHECK: %[[VAL_19:.*]] = arith.constant 8 : index -// CHECK: scf.for %[[VAL_20:.*]] = %[[VAL_14]] to %[[VAL_18]] step %[[VAL_19]] { -// CHECK: aie.use_lock(%[[VAL_7]], Acquire, 0) -// CHECK: func.call @some_work(%[[VAL_3]], %[[VAL_20]]) : (memref<16xi32>, 
index) -> () -// CHECK: aie.use_lock(%[[VAL_7]], Release, 1) -// CHECK: aie.use_lock(%[[VAL_8]], Acquire, 0) -// CHECK: %[[VAL_21:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_20]], %[[VAL_21]] : index -// CHECK: func.call @some_work(%[[VAL_4]], %[[VAL_22]]) : (memref<16xi32>, index) -> () -// CHECK: aie.use_lock(%[[VAL_8]], Release, 1) -// CHECK: aie.use_lock(%[[VAL_9]], Acquire, 0) -// CHECK: %[[VAL_23:.*]] = arith.constant 4 : index -// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_20]], %[[VAL_23]] : index -// CHECK: func.call @some_work(%[[VAL_5]], %[[VAL_24]]) : (memref<16xi32>, index) -> () -// CHECK: aie.use_lock(%[[VAL_9]], Release, 1) -// CHECK: aie.use_lock(%[[VAL_6]], Acquire, 0) -// CHECK: %[[VAL_25:.*]] = arith.constant 6 : index -// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_20]], %[[VAL_25]] : index -// CHECK: func.call @some_work(%[[VAL_2]], %[[VAL_26]]) : (memref<16xi32>, index) -> () -// CHECK: aie.use_lock(%[[VAL_6]], Release, 1) -// CHECK: } -// CHECK: aie.use_lock(%[[VAL_7]], Acquire, 0) -// CHECK: %[[VAL_27:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_18]], %[[VAL_27]] : index -// CHECK: func.call @some_work(%[[VAL_3]], %[[VAL_28]]) : (memref<16xi32>, index) -> () -// CHECK: aie.use_lock(%[[VAL_7]], Release, 1) -// CHECK: aie.use_lock(%[[VAL_8]], Acquire, 0) -// CHECK: %[[VAL_29:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_30:.*]] = arith.addi %[[VAL_18]], %[[VAL_29]] : index -// CHECK: func.call @some_work(%[[VAL_4]], %[[VAL_30]]) : (memref<16xi32>, index) -> () -// CHECK: aie.use_lock(%[[VAL_8]], Release, 1) -// CHECK: aie.use_lock(%[[VAL_9]], Acquire, 0) -// CHECK: %[[VAL_31:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_32:.*]] = arith.addi %[[VAL_14]], %[[VAL_31]] : index -// CHECK: func.call @some_work(%[[VAL_5]], %[[VAL_32]]) : (memref<16xi32>, index) -> () -// CHECK: aie.use_lock(%[[VAL_9]], Release, 1) -// CHECK: aie.use_lock(%[[VAL_6]], Acquire, 0) -// CHECK: 
%[[VAL_33:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_14]], %[[VAL_33]] : index -// CHECK: func.call @some_work(%[[VAL_2]], %[[VAL_34]]) : (memref<16xi32>, index) -> () -// CHECK: aie.use_lock(%[[VAL_6]], Release, 1) -// CHECK: aie.use_lock(%[[VAL_7]], Acquire, 0) -// CHECK: %[[VAL_35:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_36:.*]] = arith.addi %[[VAL_14]], %[[VAL_35]] : index -// CHECK: func.call @some_work(%[[VAL_3]], %[[VAL_36]]) : (memref<16xi32>, index) -> () -// CHECK: aie.use_lock(%[[VAL_7]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: } - -module @loop { - aie.device(xcvc1902) { - %tile12 = aie.tile(1, 2) - %tile13 = aie.tile(1, 3) - - aie.objectfifo @loop_of (%tile12, {%tile13}, 4 : i32) : !aie.objectfifo<memref<16xi32>> - - func.func @some_work(%line_in:memref<16xi32>, %index:index) -> () { - return - } - - %core12 = aie.core(%tile12) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c4 = arith.constant 4 : index - %c21 = arith.constant 21 : index - - %subviewTop0 = aie.objectfifo.acquire @loop_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>> - %elemTop0 = aie.objectfifo.subview.access %subviewTop0[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32> - func.call @some_work(%elemTop0, %c0) : (memref<16xi32>,index) -> () - aie.objectfifo.release @loop_of (Produce, 1) - - scf.for %indexInHeight = %c1 to %c21 step %c2 { - %subview = aie.objectfifo.acquire @loop_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32> - func.call @some_work(%elem0,%indexInHeight) : (memref<16xi32>,index) -> () - aie.objectfifo.release @loop_of (Produce, 1) - } - - scf.for %indexInHeight = %c1 to %c4 step %c1 { - %subview = aie.objectfifo.acquire @loop_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32> - func.call 
@some_work(%elem0,%indexInHeight) : (memref<16xi32>,index) -> () - aie.objectfifo.release @loop_of (Produce, 1) - } - - aie.end - } +// CHECK: scf.for %[[ARG0:.*]] = %[[C1]] to %[[C17]] step %[[VAL_19]] { +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Acquire, 0) +// CHECK-NEXT: func.call @some_work(%[[BUFF_1]], %[[ARG0]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Release, 1) +// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[MUL_0:.*]] = arith.muli %[[C2]], %[[C1_1]] : index +// CHECK-DAG: %[[ADD_0:.*]] = arith.addi %[[ARG0]], %[[MUL_0]] : index +// CHECK-DAG: aie.use_lock(%[[LOCK_2]], Acquire, 0) +// CHECK: func.call @some_work(%[[BUFF_2]], %[[ADD_0]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_2]], Release, 1) +// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[MUL_1:.*]] = arith.muli %[[C2]], %[[C2_1]] : index +// CHECK-DAG: %[[ADD_1:.*]] = arith.addi %[[ARG0]], %[[MUL_1]] : index +// CHECK-DAG: aie.use_lock(%[[LOCK_3]], Acquire, 0) +// CHECK: func.call @some_work(%[[BUFF_3]], %[[ADD_1]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_3]], Release, 1) +// CHECK-DAG: %[[C3_1:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[MUL_2:.*]] = arith.muli %[[C2]], %[[C3_1]] : index +// CHECK-DAG: %[[ADD_2:.*]] = arith.addi %[[ARG0]], %[[MUL_2]] : index +// CHECK-DAG: aie.use_lock(%[[LOCK_0]], Acquire, 0) +// CHECK: func.call @some_work(%[[BUFF_0]], %[[ADD_2]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_0]], Release, 1) +// CHECK-NEXT: } +// CHECK: scf.for %[[ARG0:.+]] = %[[C17]] to %[[C21]] step %[[C2]] { +// CHECK-DAG: aie.use_lock(%[[LOCK_1]], Acquire, 0) +// CHECK: func.call @some_work(%[[BUFF_1]], %[[ARG0]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Release, 1) +// CHECK-NEXT: } +// CHECK: %[[C1_0:.+]] = arith.constant 1 : index +// CHECK: %[[C4_1:.+]] = arith.constant 4 : index +// CHECK: scf.for 
%[[ARG0:.+]] = %[[C1]] to %[[C1_0]] step %[[C4_1]] { +// CHECK-NEXT: aie.use_lock(%[[LOCK_2]], Acquire, 0) +// CHECK-NEXT: func.call @some_work(%[[BUFF_2]], %[[ARG0]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_2]], Release, 1) +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[MUL_0:.*]] = arith.muli %[[C1]], %[[C1_2]] : index +// CHECK-DAG: %[[ADD_0:.*]] = arith.addi %[[ARG0]], %[[MUL_0]] : index +// CHECK-DAG: aie.use_lock(%[[LOCK_3]], Acquire, 0) +// CHECK: func.call @some_work(%[[BUFF_3]], %[[ADD_0]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_3]], Release, 1) +// CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[MUL_1:.*]] = arith.muli %[[C1]], %[[C2_3]] : index +// CHECK-DAG: %[[ADD_1:.*]] = arith.addi %[[ARG0]], %[[MUL_1]] : index +// CHECK-DAG: aie.use_lock(%[[LOCK_0]], Acquire, 0) +// CHECK: func.call @some_work(%[[BUFF_0]], %[[ADD_1]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_0]], Release, 1) +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[MUL_2:.*]] = arith.muli %[[C1]], %[[C3]] : index +// CHECK-DAG: %[[ADD_2:.*]] = arith.addi %[[ARG0]], %[[MUL_2]] : index +// CHECK-DAG: aie.use_lock(%[[LOCK_1]], Acquire, 0) +// CHECK: func.call @some_work(%[[BUFF_1]], %[[ADD_2]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Release, 1) +// CHECK-NEXT: } +// CHECK: scf.for %[[ARG0:.+]] = %[[C1_0]] to %[[C4]] step %[[C1]] { +// CHECK-DAG: aie.use_lock(%[[LOCK_2]], Acquire, 0) +// CHECK: func.call @some_work(%[[BUFF_2]], %[[ARG0]]) : (memref<16xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_2]], Release, 1) +// CHECK-NEXT: } +module { + aie.device(xcvc1902) { + %tile12 = aie.tile(1, 2) + %tile13 = aie.tile(1, 3) + aie.objectfifo @loop_of (%tile12, {%tile13}, 4 : i32) : !aie.objectfifo<memref<16xi32>> + func.func @some_work(%line_in:memref<16xi32>, %index:index) -> () { + return + } + %core12 = 
aie.core(%tile12) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %c21 = arith.constant 21 : index + %subviewTop0 = aie.objectfifo.acquire @loop_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>> + %elemTop0 = aie.objectfifo.subview.access %subviewTop0[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32> + func.call @some_work(%elemTop0, %c0) : (memref<16xi32>,index) -> () + aie.objectfifo.release @loop_of (Produce, 1) + scf.for %indexInHeight = %c1 to %c21 step %c2 { + %subview = aie.objectfifo.acquire @loop_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>> + %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32> + func.call @some_work(%elem0,%indexInHeight) : (memref<16xi32>,index) -> () + aie.objectfifo.release @loop_of (Produce, 1) + } + scf.for %indexInHeight = %c1 to %c4 step %c1 { + %subview = aie.objectfifo.acquire @loop_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>> + %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32> + func.call @some_work(%elem0,%indexInHeight) : (memref<16xi32>,index) -> () + aie.objectfifo.release @loop_of (Produce, 1) + } + aie.end } + } } diff --git a/test/objectFifo-stateful-transform/loop_test_nested.mlir b/test/objectFifo-stateful-transform/loop_test_nested.mlir new file mode 100644 index 0000000000..e176bb5d58 --- /dev/null +++ b/test/objectFifo-stateful-transform/loop_test_nested.mlir @@ -0,0 +1,137 @@ +//===- loop_test_nested.mlir -----------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s + +// CHECK-LABEL: aie.device(xcvc1902) { +// CHECK: memref.global "public" @loop_of : memref<16xi32> +// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) +// CHECK-DAG: %[[BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "loop_of_buff_0"} : memref<16xi32> +// CHECK-DAG: %[[BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "loop_of_buff_1"} : memref<16xi32> +// CHECK-DAG: %[[LOCK_0:.*]] = aie.lock(%[[TILE_1_2]], 0) {init = 0 : i32, sym_name = "loop_of_lock_0"} +// CHECK-DAG: %[[LOCK_1:.*]] = aie.lock(%[[TILE_1_2]], 1) {init = 0 : i32, sym_name = "loop_of_lock_1"} +// CHECK: func.func @some_work(%{{.+}}: memref<4x4xi32>, %{{.+}}: index) { +// CHECK: return +// CHECK: } +// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C21:.*]] = arith.constant 21 : index +// CHECK-DAG: %[[C4294967295:.*]] = arith.constant 4294967295 : index +// CHECK-DAG: %[[C4294967294:.*]] = arith.constant 4294967294 : index +// CHECK-DAG: %[[C2_0:.*]] = arith.constant 2 : index +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C4294967294]] step %[[C2_0]] { +// CHECK: aie.use_lock(%[[LOCK_0]], Acquire, 0) +// CHECK-NEXT: %[[REINTERPRET_0:.+]] = memref.reinterpret_cast %[[BUFF_0]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_0]], %[[C0]]) : (memref<4x4xi32>, index) -> () +// CHECK: aie.use_lock(%[[LOCK_0]], Release, 1) +// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK: scf.for %[[ARG1:.+]] = %[[C1]] to %[[C21]] step %[[C2_4]] { +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Acquire, 0) +// CHECK-NEXT: 
%[[REINTERPRET_1:.+]] = memref.reinterpret_cast %[[BUFF_1]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_1]], %[[ARG1]]) : (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Release, 1) +// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[MUL_0:.*]] = arith.muli %[[C1]], %[[C1_1]] : index +// CHECK-DAG: %[[ADD_0:.*]] = arith.addi %[[ARG1]], %[[MUL_0]] : index +// CHECK-DAG: aie.use_lock(%[[LOCK_0]], Acquire, 0) +// CHECK: %[[REINTERPRET_2:.+]] = memref.reinterpret_cast %[[BUFF_0]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_2]], %[[ADD_0]]) : (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_0]], Release, 1) +// CHECK-NEXT: } +// CHECK: aie.use_lock(%[[LOCK_1]], Acquire, 0) +// CHECK-NEXT: %[[REINTERPRET_3:.+]] = memref.reinterpret_cast %[[BUFF_1]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_3]], %[[C0]]) : (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Release, 1) +// CHECK: aie.use_lock(%[[LOCK_0]], Acquire, 0) +// CHECK-NEXT: %[[REINTERPRET_4:.+]] = memref.reinterpret_cast %[[BUFF_0]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_4]], %[[C0]]) : (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_0]], Release, 1) +// CHECK: %[[C2_3:.*]] = arith.constant 2 : index +// CHECK: scf.for %[[ARG1:.+]] = %[[C1]] to %[[C21]] step %[[C2_3]] { +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Acquire, 0) +// CHECK-NEXT: %[[REINTERPRET_5:.+]] = memref.reinterpret_cast %[[BUFF_1]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_5]], %[[ARG1]]) 
: (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Release, 1) +// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[MUL_1:.*]] = arith.muli %[[C1]], %[[C1_1]] : index +// CHECK-DAG: %[[ADD_1:.*]] = arith.addi %[[ARG1]], %[[MUL_1]] : index +// CHECK-DAG: aie.use_lock(%[[LOCK_0]], Acquire, 0) +// CHECK: %[[REINTERPRET_6:.+]] = memref.reinterpret_cast %[[BUFF_0]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_6]], %[[ADD_1]]) : (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_0]], Release, 1) +// CHECK-NEXT: } +// CHECK: aie.use_lock(%[[LOCK_1]], Acquire, 0) +// CHECK-NEXT: %[[REINTERPRET_7:.+]] = memref.reinterpret_cast %[[BUFF_1]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_7]], %[[C0]]) : (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Release, 1) +// CHECK-NEXT: } +// CHECK: aie.use_lock(%[[LOCK_0]], Acquire, 0) +// CHECK-NEXT: %[[REINTERPRET_8:.+]] = memref.reinterpret_cast %[[BUFF_0]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_8]], %[[C0]]) : (memref<4x4xi32>, index) -> () +// CHECK: aie.use_lock(%[[LOCK_0]], Release, 1) +// CHECK: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK: scf.for %[[ARG0:.+]] = %[[C1]] to %[[C21]] step %[[C2_4]] { +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Acquire, 0) +// CHECK-NEXT: %[[REINTERPRET_9:.+]] = memref.reinterpret_cast %[[BUFF_1]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_9]], %[[ARG0]]) : (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Release, 1) +// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[MUL_2:.*]] = arith.muli %[[C1]], 
%[[C1_4]] : index +// CHECK-DAG: %[[ADD_2:.*]] = arith.addi %[[ARG0]], %[[MUL_2]] : index +// CHECK-DAG: aie.use_lock(%[[LOCK_0]], Acquire, 0) +// CHECK: %[[REINTERPRET_10:.+]] = memref.reinterpret_cast %[[BUFF_0]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_10]], %[[ADD_2]]) : (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_0]], Release, 1) +// CHECK-NEXT: } +// CHECK: aie.use_lock(%[[LOCK_1]], Acquire, 0) +// CHECK-NEXT: %[[REINTERPRET_11:.+]] = memref.reinterpret_cast %[[BUFF_1]] to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> +// CHECK-NEXT: func.call @some_work(%[[REINTERPRET_11]], %[[C0]]) : (memref<4x4xi32>, index) -> () +// CHECK-NEXT: aie.use_lock(%[[LOCK_1]], Release, 1) +module { + aie.device(xcvc1902) { + %tile12 = aie.tile(1, 2) + %tile13 = aie.tile(1, 3) + aie.objectfifo @loop_of (%tile12, {%tile13}, 2 : i32) : !aie.objectfifo<memref<16xi32>> + func.func @some_work(%line_in: memref<4x4xi32>, %index: index) -> () { + return + } + %core12 = aie.core(%tile12) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %c21 = arith.constant 21 : index + %cmax = arith.constant 0xFFFFFFFF : index + scf.for %arg0 = %c0 to %cmax step %c1 { + %subviewTop0 = aie.objectfifo.acquire @loop_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>> + %elemTop0 = aie.objectfifo.subview.access %subviewTop0[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32> + %reinterpret_cast_0 = memref.reinterpret_cast %elemTop0 to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> + func.call @some_work(%reinterpret_cast_0, %c0) : (memref<4x4xi32>, index) -> () + aie.objectfifo.release @loop_of (Produce, 1) + scf.for %indexInHeight = %c1 to %c21 step %c1 { + %subview = aie.objectfifo.acquire @loop_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>> + %elem0 = 
aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32> + %reinterpret_cast_1 = memref.reinterpret_cast %elem0 to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> + func.call @some_work(%reinterpret_cast_1, %indexInHeight) : (memref<4x4xi32>, index) -> () + aie.objectfifo.release @loop_of (Produce, 1) + } + %subviewTop1 = aie.objectfifo.acquire @loop_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>> + %elemTop1 = aie.objectfifo.subview.access %subviewTop1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32> + %reinterpret_cast_2 = memref.reinterpret_cast %elemTop1 to offset: [0], sizes: [4, 4], strides: [4, 1] : memref<16xi32> to memref<4x4xi32> + func.call @some_work(%reinterpret_cast_2, %c0) : (memref<4x4xi32>, index) -> () + aie.objectfifo.release @loop_of (Produce, 1) + } + aie.end + } + } +} diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index 24c1f06965..79a1b71881 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -13,7 +13,7 @@ # TRACE: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} # TRACE: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} # TRACE: aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} -# TRACE: aiex.npu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +# 
TRACE: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} # TRACE: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32} import sys diff --git a/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem-llvm.mlir b/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem-llvm.mlir new file mode 100644 index 0000000000..c1d3d88390 --- /dev/null +++ b/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem-llvm.mlir @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// REQUIRES: peano +// RUN: mkdir -p %t/data; cd %t +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" %vector-to-llvmir% -o llvmir.mlir +// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll +// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. 
%S/testbench.cc dut.o +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: memref<1024xi16>, %arg1: memref<1024xi16>, %arg2: memref<1024xi16>) { + memref.assume_alignment %arg0, 32 : memref<1024xi16> + memref.assume_alignment %arg1, 32 : memref<1024xi16> + memref.assume_alignment %arg2, 32 : memref<1024xi16> + affine.for %arg3 = 0 to 1024 { + %0 = affine.load %arg0[%arg3] : memref<1024xi16> + %1 = affine.load %arg1[%arg3] : memref<1024xi16> + %2 = arith.maxsi %0, %1 : i16 + affine.store %2, %arg2[%arg3] : memref<1024xi16> + } + return + } +} diff --git a/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem.mlir b/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem.mlir index 1ae4491387..339ab0b0cc 100644 --- a/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem.mlir +++ b/test/unit_tests/aievec_tests/i16xi16_max_elem/i16xi16_max_elem.mlir @@ -1,7 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o -// RUN: mkdir -p data +// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. 
%S/testbench.cc work/dut.o // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s // CHECK: TEST PASSED diff --git a/test/unit_tests/aievec_tests/i16xi16_max_elem/testbench.cc b/test/unit_tests/aievec_tests/i16xi16_max_elem/testbench.cc index f8fe0a969f..60a6264401 100644 --- a/test/unit_tests/aievec_tests/i16xi16_max_elem/testbench.cc +++ b/test/unit_tests/aievec_tests/i16xi16_max_elem/testbench.cc @@ -4,7 +4,19 @@ #include #include #include + +#ifdef TO_CPP void dut(int16_t *restrict in0, int16_t *restrict in1, int16_t *restrict out0); +#elif TO_LLVM +extern "C" { +void dut(int16_t *in0_allocated, int16_t *in0_aligned, int64_t in0_offset, + int64_t in0_sizes_0, int64_t in0_strides_0, int16_t *in1_allocated, + int16_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0, + int64_t in1_strides_0, int16_t *out0_allocated, int16_t *out0_aligned, + int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0); +} +#endif + void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0); alignas(32) int16_t g_in0[IN0_SIZE]; @@ -26,7 +38,11 @@ int main(int argc, char *argv[]) { chess_memory_fence(); auto cyclesBegin = chess_cycle_count(); +#ifdef TO_CPP dut(g_in0, g_in1, g_out0); +#elif TO_LLVM + dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0); +#endif auto cyclesEnd = chess_cycle_count(); chess_memory_fence(); diff --git a/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem-llvm.mlir b/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem-llvm.mlir new file mode 100644 index 0000000000..dfdf3d3db0 --- /dev/null +++ b/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem-llvm.mlir @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+ +// REQUIRES: valid_xchess_license +// REQUIRES: peano +// RUN: mkdir -p %t/data; cd %t +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" %vector-to-llvmir% -o llvmir.mlir +// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll +// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: memref<1024xi16>, %arg1: memref<1024xi16>, %arg2: memref<1024xi16>) { + memref.assume_alignment %arg0, 32 : memref<1024xi16> + memref.assume_alignment %arg1, 32 : memref<1024xi16> + memref.assume_alignment %arg2, 32 : memref<1024xi16> + affine.for %arg3 = 0 to 1024 { + %0 = affine.load %arg0[%arg3] : memref<1024xi16> + %1 = affine.load %arg1[%arg3] : memref<1024xi16> + %2 = arith.minsi %0, %1 : i16 + affine.store %2, %arg2[%arg3] : memref<1024xi16> + } + return + } +} diff --git a/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem.mlir b/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem.mlir index c73cd2137d..377a4b42b5 100644 --- a/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem.mlir +++ b/test/unit_tests/aievec_tests/i16xi16_min_elem/i16xi16_min_elem.mlir @@ -1,7 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. 
-c dut.cc -o dut.o -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o -// RUN: mkdir -p data +// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s // CHECK: TEST PASSED diff --git a/test/unit_tests/aievec_tests/i16xi16_min_elem/testbench.cc b/test/unit_tests/aievec_tests/i16xi16_min_elem/testbench.cc index 9d7a4bd39d..35240fbda2 100644 --- a/test/unit_tests/aievec_tests/i16xi16_min_elem/testbench.cc +++ b/test/unit_tests/aievec_tests/i16xi16_min_elem/testbench.cc @@ -4,7 +4,17 @@ #include #include #include +#ifdef TO_CPP void dut(int16_t *restrict in0, int16_t *restrict in1, int16_t *restrict out0); +#elif TO_LLVM +extern "C" { +void dut(int16_t *in0_allocated, int16_t *in0_aligned, int64_t in0_offset, + int64_t in0_sizes_0, int64_t in0_strides_0, int16_t *in1_allocated, + int16_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0, + int64_t in1_strides_0, int16_t *out0_allocated, int16_t *out0_aligned, + int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0); +} +#endif void dut_ref(int16_t *in0, int16_t *in1, int16_t *out0); alignas(32) int16_t g_in0[IN0_SIZE]; @@ -26,7 +36,11 @@ int main(int argc, char *argv[]) { chess_memory_fence(); auto cyclesBegin = chess_cycle_count(); +#ifdef TO_CPP dut(g_in0, g_in1, g_out0); +#elif TO_LLVM + dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0); +#endif auto cyclesEnd = chess_cycle_count(); chess_memory_fence(); diff --git a/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem-llvm.mlir b/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem-llvm.mlir new file mode 100644 index 0000000000..0e3c3590b3 
--- /dev/null +++ b/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem-llvm.mlir @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// REQUIRES: peano +// RUN: mkdir -p %t/data; cd %t +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir +// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll +// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) { + memref.assume_alignment %arg0, 32 : memref<1024xi32> + memref.assume_alignment %arg1, 32 : memref<1024xi32> + memref.assume_alignment %arg2, 32 : memref<1024xi32> + affine.for %arg3 = 0 to 1024 { + %0 = affine.load %arg0[%arg3] : memref<1024xi32> + %1 = affine.load %arg1[%arg3] : memref<1024xi32> + %2 = arith.maxsi %0, %1 : i32 + affine.store %2, %arg2[%arg3] : memref<1024xi32> + } + return + } +} diff --git a/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem.mlir b/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem.mlir index dc5be1d5f3..7ffd6698d0 100644 --- a/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem.mlir +++ b/test/unit_tests/aievec_tests/i32xi32_max_elem/i32xi32_max_elem.mlir @@ -1,7 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. 
+ +// REQUIRES: valid_xchess_license +// RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o -// RUN: mkdir -p data +// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s // CHECK: TEST PASSED diff --git a/test/unit_tests/aievec_tests/i32xi32_max_elem/testbench.cc b/test/unit_tests/aievec_tests/i32xi32_max_elem/testbench.cc index 5c0c91ff42..036f7ecb85 100644 --- a/test/unit_tests/aievec_tests/i32xi32_max_elem/testbench.cc +++ b/test/unit_tests/aievec_tests/i32xi32_max_elem/testbench.cc @@ -4,7 +4,19 @@ #include #include #include + +#ifdef TO_CPP void dut(int32_t *restrict in0, int32_t *restrict in1, int32_t *restrict out0); +#elif TO_LLVM +extern "C" { +void dut(int32_t *in0_allocated, int32_t *in0_aligned, int64_t in0_offset, + int64_t in0_sizes_0, int64_t in0_strides_0, int32_t *in1_allocated, + int32_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0, + int64_t in1_strides_0, int32_t *out0_allocated, int32_t *out0_aligned, + int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0); +} +#endif + void dut_ref(int32_t *in0, int32_t *in1, int32_t *out0); alignas(32) int32_t g_in0[IN0_SIZE]; @@ -26,7 +38,11 @@ int main(int argc, char *argv[]) { chess_memory_fence(); auto cyclesBegin = chess_cycle_count(); +#ifdef TO_CPP dut(g_in0, g_in1, g_out0); +#elif TO_LLVM + dut(g_in0, g_in0, 0, 0, 0, g_in1, 
g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0); +#endif auto cyclesEnd = chess_cycle_count(); chess_memory_fence(); diff --git a/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem-llvm.mlir b/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem-llvm.mlir new file mode 100644 index 0000000000..96c20735d6 --- /dev/null +++ b/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem-llvm.mlir @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// REQUIRES: peano +// RUN: mkdir -p %t/data; cd %t +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir +// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll +// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) { + memref.assume_alignment %arg0, 32 : memref<1024xi32> + memref.assume_alignment %arg1, 32 : memref<1024xi32> + memref.assume_alignment %arg2, 32 : memref<1024xi32> + affine.for %arg3 = 0 to 1024 { + %0 = affine.load %arg0[%arg3] : memref<1024xi32> + %1 = affine.load %arg1[%arg3] : memref<1024xi32> + %2 = arith.minsi %0, %1 : i32 + affine.store %2, %arg2[%arg3] : memref<1024xi32> + } + return + } +} diff --git a/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem.mlir b/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem.mlir index bf2db4c50a..e0de66a437 100644 --- a/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem.mlir +++ 
b/test/unit_tests/aievec_tests/i32xi32_min_elem/i32xi32_min_elem.mlir @@ -1,7 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o -// RUN: mkdir -p data +// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s // CHECK: TEST PASSED diff --git a/test/unit_tests/aievec_tests/i32xi32_min_elem/testbench.cc b/test/unit_tests/aievec_tests/i32xi32_min_elem/testbench.cc index b4e019a3ce..f8ee9c2716 100644 --- a/test/unit_tests/aievec_tests/i32xi32_min_elem/testbench.cc +++ b/test/unit_tests/aievec_tests/i32xi32_min_elem/testbench.cc @@ -4,7 +4,19 @@ #include #include #include + +#ifdef TO_CPP void dut(int32_t *restrict in0, int32_t *restrict in1, int32_t *restrict out0); +#elif TO_LLVM +extern "C" { +void dut(int32_t *in0_allocated, int32_t *in0_aligned, int64_t in0_offset, + int64_t in0_sizes_0, int64_t in0_strides_0, int32_t *in1_allocated, + int32_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0, + int64_t in1_strides_0, int32_t *out0_allocated, int32_t *out0_aligned, + int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0); +} +#endif + void dut_ref(int32_t *in0, int32_t *in1, int32_t *out0); alignas(32) int32_t g_in0[IN0_SIZE]; @@ 
-26,7 +38,11 @@ int main(int argc, char *argv[]) { chess_memory_fence(); auto cyclesBegin = chess_cycle_count(); +#ifdef TO_CPP dut(g_in0, g_in1, g_out0); +#elif TO_LLVM + dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0); +#endif auto cyclesEnd = chess_cycle_count(); chess_memory_fence(); diff --git a/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem-llvm.mlir b/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem-llvm.mlir new file mode 100644 index 0000000000..9745852914 --- /dev/null +++ b/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem-llvm.mlir @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// REQUIRES: peano +// RUN: mkdir -p %t/data; cd %t +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=64" %vector-to-llvmir% -o llvmir.mlir +// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll +// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. 
%S/testbench.cc dut.o +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>, %arg2: memref<1024xi8>) { + memref.assume_alignment %arg0, 32 : memref<1024xi8> + memref.assume_alignment %arg1, 32 : memref<1024xi8> + memref.assume_alignment %arg2, 32 : memref<1024xi8> + affine.for %arg3 = 0 to 1024 { + %0 = affine.load %arg0[%arg3] : memref<1024xi8> + %1 = affine.load %arg1[%arg3] : memref<1024xi8> + %2 = arith.maxsi %0, %1 : i8 + affine.store %2, %arg2[%arg3] : memref<1024xi8> + } + return + } +} diff --git a/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem.mlir b/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem.mlir index 6e869eb9e4..b69055b52a 100644 --- a/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem.mlir +++ b/test/unit_tests/aievec_tests/i8xi8_max_elem/i8xi8_max_elem.mlir @@ -1,19 +1,22 @@ -// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o -// RUN: mkdir -p data +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mkdir -p %t/data; cd %t +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=64" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. 
%S/testbench.cc work/dut.o // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s // CHECK: TEST PASSED module { func.func @dut(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>, %arg2: memref<1024xi8>) { - %c0_i8 = arith.constant 0 : i8 - affine.for %arg3 = 0 to 1024 step 32 { - %0 = vector.transfer_read %arg0[%arg3], %c0_i8 : memref<1024xi8>, vector<64xi8> - %1 = vector.transfer_read %arg1[%arg3], %c0_i8 : memref<1024xi8>, vector<64xi8> - %2 = arith.maxsi %0, %1 : vector<64xi8> - vector.transfer_write %2, %arg2[%arg3] : vector<64xi8>, memref<1024xi8> + affine.for %arg3 = 0 to 1024 { + %0 = affine.load %arg0[%arg3] : memref<1024xi8> + %1 = affine.load %arg1[%arg3] : memref<1024xi8> + %2 = arith.maxsi %0, %1 : i8 + affine.store %2, %arg2[%arg3] : memref<1024xi8> } return } diff --git a/test/unit_tests/aievec_tests/i8xi8_max_elem/testbench.cc b/test/unit_tests/aievec_tests/i8xi8_max_elem/testbench.cc index 6e3f1ba3d9..f9fdb84a62 100644 --- a/test/unit_tests/aievec_tests/i8xi8_max_elem/testbench.cc +++ b/test/unit_tests/aievec_tests/i8xi8_max_elem/testbench.cc @@ -4,7 +4,19 @@ #include #include #include + +#ifdef TO_CPP void dut(int8_t *restrict in0, int8_t *restrict in1, int8_t *restrict out0); +#elif TO_LLVM +extern "C" { +void dut(int8_t *in0_allocated, int8_t *in0_aligned, int64_t in0_offset, + int64_t in0_sizes_0, int64_t in0_strides_0, int8_t *in1_allocated, + int8_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0, + int64_t in1_strides_0, int8_t *out0_allocated, int8_t *out0_aligned, + int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0); +} +#endif + void dut_ref(int8_t *in0, int8_t *in1, int8_t *out0); alignas(32) int8_t g_in0[IN0_SIZE]; @@ -26,7 +38,11 @@ int main(int argc, char *argv[]) { chess_memory_fence(); auto cyclesBegin = chess_cycle_count(); +#ifdef TO_CPP dut(g_in0, g_in1, g_out0); +#elif TO_LLVM + 
dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0); +#endif auto cyclesEnd = chess_cycle_count(); chess_memory_fence(); diff --git a/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem-llvm.mlir b/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem-llvm.mlir new file mode 100644 index 0000000000..0b02971e0d --- /dev/null +++ b/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem-llvm.mlir @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// REQUIRES: peano +// RUN: mkdir -p %t/data; cd %t +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=64" %vector-to-llvmir% -o llvmir.mlir +// RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll +// RUN: %PEANO_INSTALL_DIR/bin/clang %clang_aie2_args -c dut.ll -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_LLVM +w work +o work -I%S -I. %S/testbench.cc dut.o +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED + +module { + func.func @dut(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>, %arg2: memref<1024xi8>) { + memref.assume_alignment %arg0, 32 : memref<1024xi8> + memref.assume_alignment %arg1, 32 : memref<1024xi8> + memref.assume_alignment %arg2, 32 : memref<1024xi8> + affine.for %arg3 = 0 to 1024 { + %0 = affine.load %arg0[%arg3] : memref<1024xi8> + %1 = affine.load %arg1[%arg3] : memref<1024xi8> + %2 = arith.minsi %0, %1 : i8 + affine.store %2, %arg2[%arg3] : memref<1024xi8> + } + return + } +} diff --git a/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem.mlir b/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem.mlir index 0e8522b990..dcdf2cc288 100644 --- a/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem.mlir +++ 
b/test/unit_tests/aievec_tests/i8xi8_min_elem/i8xi8_min_elem.mlir @@ -1,19 +1,22 @@ -// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -c dut.cc -o dut.o -// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc work/dut.o -// RUN: mkdir -p data +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + +// REQUIRES: valid_xchess_license +// RUN: mkdir -p %t/data; cd %t +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=64" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper %xchesscc_aie2_args +w work +o work -I%S -I. -c dut.cc -o dut.o +// RUN: xchesscc_wrapper %xchesscc_aie2_args -DTO_CPP +w work +o work -I%S -I. %S/testbench.cc work/dut.o // RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s // CHECK: TEST PASSED module { func.func @dut(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>, %arg2: memref<1024xi8>) { - %c0_i8 = arith.constant 0 : i8 - affine.for %arg3 = 0 to 1024 step 32 { - %0 = vector.transfer_read %arg0[%arg3], %c0_i8 : memref<1024xi8>, vector<64xi8> - %1 = vector.transfer_read %arg1[%arg3], %c0_i8 : memref<1024xi8>, vector<64xi8> - %2 = arith.minsi %0, %1 : vector<64xi8> - vector.transfer_write %2, %arg2[%arg3] : vector<64xi8>, memref<1024xi8> + affine.for %arg3 = 0 to 1024 { + %0 = affine.load %arg0[%arg3] : memref<1024xi8> + %1 = affine.load %arg1[%arg3] : memref<1024xi8> + %2 = arith.minsi %0, %1 : i8 + affine.store %2, %arg2[%arg3] : memref<1024xi8> } return } diff --git a/test/unit_tests/aievec_tests/i8xi8_min_elem/testbench.cc 
b/test/unit_tests/aievec_tests/i8xi8_min_elem/testbench.cc index b4b1102193..0aed91af2f 100644 --- a/test/unit_tests/aievec_tests/i8xi8_min_elem/testbench.cc +++ b/test/unit_tests/aievec_tests/i8xi8_min_elem/testbench.cc @@ -4,7 +4,19 @@ #include #include #include + +#ifdef TO_CPP void dut(int8_t *restrict in0, int8_t *restrict in1, int8_t *restrict out0); +#elif TO_LLVM +extern "C" { +void dut(int8_t *in0_allocated, int8_t *in0_aligned, int64_t in0_offset, + int64_t in0_sizes_0, int64_t in0_strides_0, int8_t *in1_allocated, + int8_t *in1_aligned, int64_t in1_offset, int64_t in1_sizes_0, + int64_t in1_strides_0, int8_t *out0_allocated, int8_t *out0_aligned, + int64_t out0_offset, int64_t out0_sizes_0, int64_t out0_strides_0); +} +#endif + void dut_ref(int8_t *in0, int8_t *in1, int8_t *out0); alignas(32) int8_t g_in0[IN0_SIZE]; @@ -26,7 +38,11 @@ int main(int argc, char *argv[]) { chess_memory_fence(); auto cyclesBegin = chess_cycle_count(); +#ifdef TO_CPP dut(g_in0, g_in1, g_out0); +#elif TO_LLVM + dut(g_in0, g_in0, 0, 0, 0, g_in1, g_in1, 0, 0, 0, g_out0, g_out0, 0, 0, 0); +#endif auto cyclesEnd = chess_cycle_count(); chess_memory_fence(); diff --git a/tools/aie2xclbin/XCLBinGen.cpp b/tools/aie2xclbin/XCLBinGen.cpp index ce9746fe2e..86d5084a40 100644 --- a/tools/aie2xclbin/XCLBinGen.cpp +++ b/tools/aie2xclbin/XCLBinGen.cpp @@ -393,47 +393,46 @@ static json::Object makeKernelJSON(std::string name, std::string id, {"name", name}, {"type", "dpu"}, {"extended-data", json::Object{{"subtype", "DPU"}, - {"functional", "1"}, + {"functional", "0"}, {"dpu_kernel_id", id}}}, - {"arguments", json::Array{json::Object{{"name", "instr"}, + {"arguments", json::Array{json::Object{{"name", "opcode"}, + {"address-qualifier", "SCALAR"}, + {"type", "uint64_t"}, + {"offset", "0x00"}}, + json::Object{{"name", "instr"}, {"memory-connection", "SRAM"}, {"address-qualifier", "GLOBAL"}, {"type", "char *"}, - {"offset", "0x00"}}, + {"offset", "0x08"}}, json::Object{{"name", "ninstr"}, 
{"address-qualifier", "SCALAR"}, - {"type", "uint64_t"}, - {"offset", "0x08"}}, + {"type", "uint32_t"}, + {"offset", "0x10"}}, json::Object{{"name", "bo0"}, {"memory-connection", "HOST"}, {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x10"}}, + {"type", "void*"}, + {"offset", "0x14"}}, json::Object{{"name", "bo1"}, {"memory-connection", "HOST"}, {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x18"}}, + {"type", "void*"}, + {"offset", "0x1c"}}, json::Object{{"name", "bo2"}, {"memory-connection", "HOST"}, {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x20"}}, + {"type", "void*"}, + {"offset", "0x24"}}, json::Object{{"name", "bo3"}, {"memory-connection", "HOST"}, {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x28"}}, + {"type", "void*"}, + {"offset", "0x2c"}}, json::Object{{"name", "bo4"}, {"memory-connection", "HOST"}, {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x30"}}, - json::Object{{"name", "bo5"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x38"}}}}, + {"type", "void*"}, + {"offset", "0x34"}}}}, {"instances", json::Array{json::Object{{"name", instance}}}}}; } diff --git a/utils/quick_setup.sh b/utils/quick_setup.sh index 27f4ab9410..6a47f28faf 100755 --- a/utils/quick_setup.sh +++ b/utils/quick_setup.sh @@ -62,10 +62,10 @@ if test -f "$VPP"; then mkdir -p my_install pushd my_install # pip download mlir_aie -f https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels/ - wget -q --show-progress https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024052821+aa2178a-py3-none-manylinux_2_35_x86_64.whl + wget -q --show-progress https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024061022+9716dc7-py3-none-manylinux_2_35_x86_64.whl unzip -q mlir_aie-*_x86_64.whl # pip download mlir -f 
https://github.com/Xilinx/mlir-aie/releases/expanded_assets/mlir-distro/ - wget -q --show-progress https://github.com/Xilinx/mlir-aie/releases/download/mlir-distro/mlir-19.0.0.2024052220+25b65be4-py3-none-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + wget -q --show-progress https://github.com/Xilinx/mlir-aie/releases/download/mlir-distro/mlir-19.0.0.2024060317+a088c61d-py3-none-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl unzip -q mlir-*_x86_64.whl pip install https://github.com/makslevental/mlir-python-extras/archive/d84f05582adb2eed07145dabce1e03e13d0e29a6.zip rm -rf mlir*.whl