Skip to content

Commit

Permalink
Switch to using transaction binary flow with no control packet (#1517)
Browse files Browse the repository at this point in the history
Co-authored-by: Joseph Melber <jmelber@xilinx.com>
Co-authored-by: Jeff Fifield <jeff.fifield@amd.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
4 people authored Jun 10, 2024
1 parent 7635c9e commit 1a362e6
Show file tree
Hide file tree
Showing 75 changed files with 700 additions and 586 deletions.
42 changes: 31 additions & 11 deletions include/aie/Dialect/AIEX/IR/AIEX.td
Original file line number Diff line number Diff line change
Expand Up @@ -567,17 +567,20 @@ def AIE_NpuWriteRTPOp: AIEX_Op<"npu.rtp_write", []> {
}

// Push BD to Queue
def AIE_NpuShimTilePushQueueOp: AIEX_Op<"npu.shimtile_push_queue", []> {
def AIE_NpuPushQueueOp: AIEX_Op<"npu.push_queue", []> {
let summary = "bd queue push operator";
let arguments = (
ins FlatSymbolRefAttr:$metadata,
ins I32Attr:$column,
I32Attr:$row,
DMAChannelDir:$direction,
I32Attr:$channel,
BoolAttr:$issue_token,
I32Attr:$repeat_count,
I32Attr:$bd_id
);
let results = (outs );
let assemblyFormat = [{
attr-dict
`(` $column `,` $row `,` $direction `:` $channel `)` attr-dict
}];
let hasVerifier = 1;
let description = [{
Expand All @@ -589,10 +592,10 @@ def AIE_NpuShimTilePushQueueOp: AIEX_Op<"npu.shimtile_push_queue", []> {
def AIE_NpuWrite32Op: AIEX_Op<"npu.write32", []> {
let summary = "write32 operator";
let arguments = (
ins I32Attr:$column,
I32Attr:$row,
UI32Attr:$address,
UI32Attr:$value
ins UI32Attr:$address,
UI32Attr:$value,
OptionalAttr<I32Attr>:$column,
OptionalAttr<I32Attr>:$row
);
let results = (outs );
let assemblyFormat = [{
Expand Down Expand Up @@ -623,12 +626,28 @@ def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> {
}];
}

// WRITEBD_EXTEND_SHIMTILE
def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"npu.writebd_shimtile", []> {
// XAIE_IO_CUSTOM_OP_BEGIN + 1 (address patch)
//
// Declares the `npu.address_patch` op. The op carries three integer
// attributes ($addr, $arg_idx, $arg_plus), produces no results, and its
// assembly format consists solely of the attribute dictionary. No verifier
// is requested for it in this definition.
//
// The DmaToNpuPattern lowering in AIEDmaToNpu.cpp creates this op as
// (addr, arg_idx, offset), where
//   addr = (col << columnShift) | (0x1D004 + id * 0x20).
//
// NOTE(review): $arg_idx appears to be the position of the buffer among the
// enclosing function's arguments, and $arg_plus an offset applied to that
// argument's address when $addr is patched — confirm the units (bytes vs.
// elements) against the code that consumes this op.
def AIE_NpuAddressPatchOp: AIEX_Op<"npu.address_patch", []> {
let summary = "address patch operator";
let arguments = (
ins UI32Attr:$addr,
I32Attr:$arg_idx,
I32Attr:$arg_plus
);
let results = (outs );
let assemblyFormat = [{
attr-dict
}];
let description = [{
address patch operator
}];
}

// NPU Bd Write operation
def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd", []> {
let summary = "dma operator";
let arguments = (
ins I32Attr:$column,
I32Attr:$column_num,
I32Attr:$ddr_id,
I32Attr:$bd_id,
I32Attr:$buffer_length,
Expand All @@ -646,6 +665,7 @@ def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"npu.writebd_shimtile", []> {
I32Attr:$iteration_size,
I32Attr:$iteration_stride,
I32Attr:$next_bd,
I32Attr:$row,
I32Attr:$use_next_bd,
I32Attr:$valid_bd,
I32Attr:$lock_rel_val,
Expand All @@ -658,7 +678,7 @@ def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"npu.writebd_shimtile", []> {
let assemblyFormat = [{ attr-dict }];
let hasVerifier = 1;
let description = [{
writebd_shimtile operator
writebd operator
}];
}

Expand Down
8 changes: 4 additions & 4 deletions lib/Dialect/AIEX/IR/AIEXDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,19 +114,19 @@ LogicalResult AIEX::NpuDmaWaitOp::verify() {
return success();
}

// Verifier for the BD push-queue op.
//
// Emits an op error when the BD ID is greater than the BD count reported by
// the target model, or when the repeat count is greater than 255; otherwise
// returns success().
//
// NOTE(review): the BD check uses `>`, so a BD ID equal to getNumBDs() is
// accepted. If BD IDs are zero-based (0 .. numBds-1) this is off by one and
// should probably be `>=` — confirm against the target model's contract.
LogicalResult AIEX::NpuShimTilePushQueueOp::verify() {
LogicalResult AIEX::NpuPushQueueOp::verify() {
const auto &targetModel = AIE::getTargetModel(*this);
auto numBds = targetModel.getNumBDs(0, 0); // assume shim
// Query the BD count for the tile named by the op's own column/row
// attributes.
auto numBds = targetModel.getNumBDs(getColumn(), getRow());
if (getBdId() > numBds)
return emitOpError("BD ID exceeds the maximum ID.");
// The lowering packs the repeat count into an 8-bit field, hence the 255 cap.
if (getRepeatCount() > 255)
return emitOpError("Repeat count exceeds the [0:255] range.");
return success();
}

LogicalResult AIEX::NpuWriteBdExShimTileOp::verify() {
LogicalResult AIEX::NpuWriteBdOp::verify() {
const auto &targetModel = AIE::getTargetModel(*this);
auto numBds = targetModel.getNumBDs(0, 0); // assume shim
auto numBds = targetModel.getNumBDs(getColumn(), getRow());
if (getBdId() > numBds)
return emitOpError("BD ID exceeds the maximum ID.");
if (getD0Size() > 0x3FF)
Expand Down
96 changes: 37 additions & 59 deletions lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,73 +110,48 @@ struct RtpToNpuPattern : OpConversionPattern<NpuWriteRTPOp> {
IntegerAttr row = IntegerAttr::get(i32ty, r);
IntegerAttr address = IntegerAttr::get(ui32ty, rtp_buffer_addr);
IntegerAttr value = IntegerAttr::get(i32ty, v);
rewriter.create<NpuWrite32Op>(op->getLoc(), column.getInt(), row.getInt(),
address.getUInt(), value.getInt());
rewriter.create<NpuWrite32Op>(op->getLoc(), address.getUInt(),
value.getInt(), column, row);

rewriter.eraseOp(op);
return success();
}
};

struct PushToNpuPattern : OpConversionPattern<NpuShimTilePushQueueOp> {

private:
ShimDMAllocationGetter &allocGetter;
struct PushToNpuPattern : OpConversionPattern<NpuPushQueueOp> {

public:
using OpConversionPattern::OpConversionPattern;

PushToNpuPattern(MLIRContext *context, ShimDMAllocationGetter &getter,
PatternBenefit benefit = 1)
: OpConversionPattern(context, benefit), allocGetter(getter) {}
PushToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1)
: OpConversionPattern(context, benefit) {}

// Lowers a push-queue op to a single NpuWrite32Op and erases the original op.
//
// The write targets a task-queue register whose offset is chosen from the DMA
// direction and channel, and its value is a command word assembled from the
// op's BD ID, repeat count and issue-token flag. Always returns success()
// once the write has been created.
LogicalResult
matchAndRewrite(NpuShimTilePushQueueOp op, OpAdaptor adaptor,
matchAndRewrite(NpuPushQueueOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
auto *ctx = op->getContext();
auto i32ty = IntegerType::get(ctx, 32);
auto zero = IntegerAttr::get(i32ty, 0);
auto ui32ty =
IntegerType::get(ctx, 32, IntegerType::SignednessSemantics::Unsigned);
bool send_tct = op.getIssueToken();
uint32_t channel_num = 0;

// initialize fields to zero
auto dev = op->getParentOfType<AIE::DeviceOp>();
if (!dev)
return op->emitOpError("couldn't find parent of type DeviceOp");

auto infoOp = allocGetter.get(dev, op.getMetadata());
if (!infoOp)
return op->emitOpError("couldn't find shim_dma_allocation op.");

auto channelDir = infoOp->getChannelDir();
bool isMM2S = channelDir == AIE::DMAChannelDir::MM2S;
channel_num += infoOp->getChannelIndex();

IntegerAttr column = IntegerAttr::get(i32ty, infoOp->getCol());

// the offset of the task queue register in the tile
// MM2S uses 0x1D214, any other direction uses 0x1D204; channel 1 adds 0x8.
// NOTE(review): these offsets are applied regardless of the op's row —
// presumably they are the shim-tile DMA queue registers; confirm before
// using this pattern for non-shim tiles.
uint32_t queue_offset;
if (isMM2S)
if (op.getDirection() == AIE::DMAChannelDir::MM2S)
queue_offset = 0x1D214;
else
queue_offset = 0x1D204;
if (channel_num == 1)
if (op.getChannel() == 1)
queue_offset += 0x8;
IntegerAttr address = IntegerAttr::get(ui32ty, queue_offset);

// value
// the value to write
// Command word layout as assembled below:
//   bits [3:0]   BD ID (masked with 0xF)
//   bits [23:16] repeat count (masked with 0xFF)
//   bit  31      set when the op requests a token to be issued
uint32_t bd_id = op.getBdId();
uint32_t repeat_cnt = op.getRepeatCount();
uint32_t cmd = 0;
cmd |= bd_id & 0xF;
cmd |= (repeat_cnt & 0xFF) << 16;
if (send_tct)
if (op.getIssueToken())
cmd |= 0x80000000;
IntegerAttr value = IntegerAttr::get(ui32ty, cmd);

rewriter.create<NpuWrite32Op>(op->getLoc(), column.getInt(), zero.getInt(),
address.getUInt(), value.getUInt());
auto i32ty = IntegerType::get(op->getContext(), 32);
auto column = IntegerAttr::get(i32ty, op.getColumn());
// NOTE(review): the row passed to the write is the constant 0 rather than
// the op's own row attribute — confirm this is intentional.
auto row = IntegerAttr::get(i32ty, 0);
rewriter.create<NpuWrite32Op>(op->getLoc(), queue_offset, cmd, column, row);
rewriter.eraseOp(op);
return success();
}
Expand Down Expand Up @@ -216,7 +191,6 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {

// initialize fields to zero
auto column = zero;
auto column_num = zero;
auto ddr_id = zero;
auto bd_id = zero;
auto buffer_length = zero;
Expand All @@ -234,6 +208,7 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
auto iteration_size = zero;
auto iteration_stride = zero;
auto next_bd = zero;
auto row = zero;
auto use_next_bd = zero;
auto valid_bd = zero;
auto lock_rel_val = zero;
Expand All @@ -258,9 +233,6 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
// column
column = IntegerAttr::get(i32ty, col);

// column_num
column_num = IntegerAttr::get(i32ty, 1);

// ddr_id
Block &entryBB = op->getParentOfType<func::FuncOp>().getBody().front();
int arg_idx = -1;
Expand Down Expand Up @@ -364,15 +336,23 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
if (!isMM2S)
issue_token = BoolAttr::get(ctx, true);

(void)rewriter.create<NpuWriteBdExShimTileOp>(
op->getLoc(), column, column_num, ddr_id, bd_id, buffer_length,
buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type,
d0_size, d0_stride, d1_size, d1_stride, d2_stride, iteration_current,
iteration_size, iteration_stride, next_bd, use_next_bd, valid_bd,
rewriter.create<NpuWriteBdOp>(
op->getLoc(), column, ddr_id, bd_id, buffer_length, buffer_offset,
enable_packet, out_of_order_id, packet_id, packet_type, d0_size,
d0_stride, d1_size, d1_stride, d2_stride, iteration_current,
iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd,
lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id);

rewriter.create<NpuShimTilePushQueueOp>(op->getLoc(), op.getMetadataAttr(),
issue_token, repeat_count, bd_id);
const AIE::AIETargetModel &tm =
op->getParentOfType<AIE::DeviceOp>().getTargetModel();

uint32_t addr =
(col << tm.getColumnShift()) | (0x1D004 + op.getId() * 0x20);
rewriter.create<NpuAddressPatchOp>(op->getLoc(), addr, arg_idx, offset);

rewriter.create<NpuPushQueueOp>(
op->getLoc(), column, row, infoOp->getChannelDirAttr(),
infoOp->getChannelIndexAttr(), issue_token, repeat_count, bd_id);

rewriter.eraseOp(op);
return success();
Expand Down Expand Up @@ -406,15 +386,13 @@ struct DmaWaitToNpuPattern : OpConversionPattern<NpuDmaWaitOp> {
if (!shimDmaAllocOp) {
return op->emitError("couldn't find shim_dma_allocation op");
}
AIE::DMAChannelDir channelDir = shimDmaAllocOp->getChannelDir();
int channel = shimDmaAllocOp->getChannelIndex();
int direction = (int)(channelDir == AIE::DMAChannelDir::MM2S);
int column = shimDmaAllocOp->getCol();

// Create with `column_num == 1` and `row_num == 1` to check for a single
// column and row. Row is always 0 for shim tiles.
(void)rewriter.replaceOpWithNewOp<NpuSyncOp>(op, column, 0, direction,
channel, 1, 1);
(void)rewriter.replaceOpWithNewOp<NpuSyncOp>(
op, shimDmaAllocOp->getCol(), /* row */ 0,
static_cast<uint32_t>(shimDmaAllocOp->getChannelDir()),
shimDmaAllocOp->getChannelIndex(), 1, 1);
return success();
}
};
Expand All @@ -433,12 +411,12 @@ struct AIEDmaToNpuPass : AIEDmaToNpuBase<AIEDmaToNpuPass> {
target.addIllegalOp<NpuWriteRTPOp>();
target.addIllegalOp<NpuDmaMemcpyNdOp>();
target.addIllegalOp<NpuDmaWaitOp>();
target.addIllegalOp<NpuShimTilePushQueueOp>();
target.addIllegalOp<NpuPushQueueOp>();

RewritePatternSet patterns(&getContext());
patterns.insert<DmaToNpuPattern>(&getContext(), cachingGetter);
patterns.insert<DmaWaitToNpuPattern>(&getContext(), cachingGetter);
patterns.insert<PushToNpuPattern>(&getContext(), cachingGetter);
patterns.insert<PushToNpuPattern>(&getContext());
patterns.insert<RtpToNpuPattern>(&getContext());

if (failed(applyPartialConversion(device, target, std::move(patterns))))
Expand Down
6 changes: 2 additions & 4 deletions lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,11 @@ struct AIEXToStandardPass : AIEXToStandardBase<AIEXToStandardPass> {
RewritePatternSet removepatterns(&getContext());
removepatterns.add<AIEXOpRemoval<NpuDmaMemcpyNdOp>>(m.getContext(), m);
removepatterns.add<AIEXOpRemoval<NpuDmaWaitOp>>(m.getContext(), m);
removepatterns.add<AIEXOpRemoval<NpuShimTilePushQueueOp>>(m.getContext(),
m);
removepatterns.add<AIEXOpRemoval<NpuPushQueueOp>>(m.getContext(), m);
removepatterns.add<AIEXOpRemoval<NpuWriteRTPOp>>(m.getContext(), m);
removepatterns.add<AIEXOpRemoval<NpuWrite32Op>>(m.getContext(), m);
removepatterns.add<AIEXOpRemoval<NpuSyncOp>>(m.getContext(), m);
removepatterns.add<AIEXOpRemoval<NpuWriteBdExShimTileOp>>(m.getContext(),
m);
removepatterns.add<AIEXOpRemoval<NpuWriteBdOp>>(m.getContext(), m);

if (failed(applyPartialConversion(m, target, std::move(removepatterns))))
signalPassFailure();
Expand Down
Loading

0 comments on commit 1a362e6

Please sign in to comment.