Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add repeat support for compute tiles #1842

Draft
wants to merge 24 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
92fa41b
Add repeat support for compute tiles.
abisca Oct 16, 2024
d8c177d
Add test
abisca Oct 16, 2024
edab881
Remove unnecessary file
abisca Oct 16, 2024
d530ea9
Add min value constraint for repeat_count
abisca Oct 16, 2024
7cf14b1
Merge with main and fix conflicts. Temporarily keep deleted prog_exam…
abisca Oct 18, 2024
4404ae1
Update lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
AndraBisca Oct 18, 2024
26a5725
Fix tests. Remove generation of repeatCount attribute for compute til…
abisca Oct 18, 2024
839c11d
Merge branch 'compute-tile-repeat' of https://github.com/Xilinx/mlir-…
abisca Oct 18, 2024
dc2bcb0
Remove ability to infer repeat from data layout transformations.
abisca Oct 18, 2024
afa59fe
Update lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
AndraBisca Oct 18, 2024
ebaeaef
Merge branch 'main' of https://github.com/Xilinx/mlir-aie into comput…
abisca Oct 18, 2024
44a240f
Can see error with test.cpp locally; try redirecting stderr to stdout…
hunhoffe Oct 21, 2024
ae94b44
Fix discrepencies in test.cpp between distribute-repeat test and example
hunhoffe Oct 21, 2024
a7b0d64
Fix discrepencies between test.cpp in simple_repeat test vs programmi…
hunhoffe Oct 21, 2024
46052e9
fix typo
hunhoffe Oct 21, 2024
5ecaf9c
Deleted repeat from programming examples (this is replaced by test/np…
hunhoffe Oct 21, 2024
b26795d
Remove unused makefiles
hunhoffe Oct 21, 2024
aafda43
Update READMEs
abisca Oct 21, 2024
5634023
Fix test
abisca Oct 21, 2024
532f016
Merge branch 'main' of https://github.com/Xilinx/mlir-aie into comput…
abisca Oct 21, 2024
d4a8109
Merge branch 'main' into compute-tile-repeat
AndraBisca Oct 21, 2024
636a0a0
Always use DMAs if repeat count is specified
abisca Oct 24, 2024
a709362
Documentation
abisca Oct 30, 2024
085e7e2
Merge branch 'compute-tile-repeat' of https://github.com/Xilinx/mlir-…
abisca Oct 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/aie/Dialect/AIE/IR/AIEOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -1684,8 +1684,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol]
// via_shared_mem==0 means use producer tile's memory module
// via_shared_mem==1 means use consumer tile's memory module
OptionalAttr<AIEI32Attr>:$via_shared_mem,
// memtile_repeat==0 means "do it once" and don't repeat
OptionalAttr<AIEI32Attr>:$memtile_repeat
// repeat_count==1 means "do it once"
OptionalAttr<ConfinedAttr<AIEI32Attr, [IntMinValue<1>]>>:$repeat_count
);

let assemblyFormat = [{
Expand Down
25 changes: 16 additions & 9 deletions lib/Dialect/AIE/IR/AIEDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -494,10 +494,9 @@ LogicalResult ObjectFifoCreateOp::verify() {
"`via_shared_mem` can only be used in 1-to-1 object FIFOs");
}

if (getMemtileRepeat().has_value()) {
if (!getProducerTileOp().isMemTile())
return emitError("`memtile_repeat` can only be used with a mem tile "
"producer");
if (getRepeatCount().has_value()) {
if (getProducerTileOp().isShimTile())
return emitError("`repeat_count` unavailable for shim tiles");
}

return success();
Expand Down Expand Up @@ -602,10 +601,18 @@ LogicalResult ObjectFifoLinkOp::verify() {
return emitError("ObjectFifoLinkOp does not support 'join' and "
"'distribute' at the same time");

if (auto sharedTile = getOptionalSharedTile(); !sharedTile)
auto sharedTile = getOptionalSharedTile();
if (!sharedTile)
return emitError("ObjectFifoLinkOp must have a link point, i.e., a "
"shared tile between objectFifos");

TileOp tile = cast<TileOp>(sharedTile.value().getDefiningOp());
if (!tile.isMemTile()) {
if (isJoin() || isDistribute())
return emitError("ObjectFifoLinkOp join and distribute are "
"unavailable on compute or shim tiles");
}

if (isJoin()) {
if (getFifoIns().size() != getSrcOffsets().size())
return emitOpError("number of provided src offsets must be equal "
Expand Down Expand Up @@ -643,8 +650,8 @@ LogicalResult ObjectFifoLinkOp::verify() {

std::vector<int> repeat_counts;
for (auto fifoOut : getOutputObjectFifos()) {
if (fifoOut.getMemtileRepeat().has_value())
repeat_counts.push_back(fifoOut.getMemtileRepeat().value());
if (fifoOut.getRepeatCount().has_value())
repeat_counts.push_back(fifoOut.getRepeatCount().value());
else
repeat_counts.push_back(0);
}
Expand Down Expand Up @@ -761,8 +768,8 @@ std::vector<int> ObjectFifoLinkOp::getDistributeTransferLengths() {

std::optional<int> ObjectFifoLinkOp::getRepeatCount() {
for (auto fifoOut : getOutputObjectFifos())
if (fifoOut.getMemtileRepeat().has_value())
return {fifoOut.getMemtileRepeat().value()};
if (fifoOut.getRepeatCount().has_value())
return {fifoOut.getRepeatCount().value()};
return {};
}

Expand Down
54 changes: 33 additions & 21 deletions lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ struct AIEObjectFifoStatefulTransformPass
// the objectFifo broadcasts to multiple tiles, if one of the consumers or
// the producer wants to use the multi-dimensional address generation
// features of the DMA, if the objectFifo is part of a LinkOp, or if the
// via_DMA attribute of the objectFifo is set.
// via_DMA or repeat_count attributes of the objectFifo are set.
bool requiresDMAs(ObjectFifoCreateOp createOp, int &share_direction) {
bool hasSharedMemory = false;
bool atLeastOneConsumerWantsTransform = false;
Expand All @@ -185,6 +185,9 @@ struct AIEObjectFifoStatefulTransformPass
if (createOp.getVia_DMA())
return true;

if (createOp.getRepeatCount().has_value())
return true;

if (createOp.getConsumerTiles().size() == 1 &&
createOp.getDimensionsToStream().empty()) {

Expand Down Expand Up @@ -431,7 +434,7 @@ struct AIEObjectFifoStatefulTransformPass
}
if (linked) {
if (linkOp->getRepeatCount().has_value())
numElem *= linkOp->getRepeatCount().value() + 1;
numElem *= linkOp->getRepeatCount().value();
if (linkOp->isDistribute())
numElem *= linkOp->getFifoOuts().size();
else if (linkOp->isJoin())
Expand Down Expand Up @@ -542,12 +545,25 @@ struct AIEObjectFifoStatefulTransformPass
auto elemType = llvm::cast<MemRefType>(fifo.getElementType());
int len = elemType.getNumElements();

// check for repeat count
int repeatCount = 1;
if (op.getRepeatCount().has_value())
repeatCount = op.getRepeatCount().value();

// search for the buffers/locks (based on if this objFifo has a link)
ObjectFifoCreateOp target = op;
if (std::optional<ObjectFifoLinkOp> linkOp = getOptionalLinkOp(op);
linkOp.has_value())
if (objFifoLinks.find(linkOp.value()) != objFifoLinks.end())
linkOp.has_value()) {
if (objFifoLinks.find(linkOp.value()) != objFifoLinks.end()) {
target = objFifoLinks[linkOp.value()];
if (target == op) {
if (linkOp->getRepeatCount().has_value()) {
acqNum *= linkOp->getRepeatCount().value();
relNum *= linkOp->getRepeatCount().value();
}
}
}
}

// search for MemOp
Operation *producerMem = nullptr;
Expand Down Expand Up @@ -582,7 +598,7 @@ struct AIEObjectFifoStatefulTransformPass
// create DMA channel
builder.setInsertionPointToStart(dmaBlock);
builder.create<DMAStartOp>(builder.getUnknownLoc(), channelDir,
channelIndex, /*repeatCount*/ 0, bdBlock,
channelIndex, repeatCount - 1, bdBlock,
endBlock);
if (lastDmaBlock != nullptr)
lastDmaBlock->getTerminator()->setSuccessor(dmaBlock, 1);
Expand Down Expand Up @@ -701,17 +717,9 @@ struct AIEObjectFifoStatefulTransformPass
int relNum = 1;

// check for repeat count
int repeatCount = 0;
if (!dims.getValue().empty()) {
auto highestStride = dims.getValue().begin()->getStride() - 1;
if (highestStride == 0) {
repeatCount = dims.getValue().begin()->getSize();
dims = AIE::BDDimLayoutArrayAttr::get(op->getContext(),
dims.getValue().drop_front(1));
}
}
if (op.getMemtileRepeat().has_value())
repeatCount = op.getMemtileRepeat().value();
int repeatCount = 1;
if (op.getRepeatCount().has_value())
repeatCount = op.getRepeatCount().value();

// search for the buffers/locks (based on if this objFifo has a link)
// identify size difference between input and output memrefs
Expand All @@ -727,9 +735,8 @@ struct AIEObjectFifoStatefulTransformPass

if (target == op) {
if (linkOp->getRepeatCount().has_value()) {
// +1 for original data movement
acqNum *= linkOp->getRepeatCount().value() + 1;
relNum *= linkOp->getRepeatCount().value() + 1;
acqNum *= linkOp->getRepeatCount().value();
relNum *= linkOp->getRepeatCount().value();
}
}

Expand Down Expand Up @@ -817,7 +824,8 @@ struct AIEObjectFifoStatefulTransformPass
// create DMA channel
builder.setInsertionPointToStart(dmaBlock);
builder.create<DMAStartOp>(builder.getUnknownLoc(), channelDir,
channelIndex, repeatCount, bdBlock, endBlock);
channelIndex, repeatCount - 1, bdBlock,
endBlock);
if (lastDmaBlock != nullptr)
lastDmaBlock->getTerminator()->setSuccessor(dmaBlock, 1);

Expand Down Expand Up @@ -1322,8 +1330,12 @@ struct AIEObjectFifoStatefulTransformPass

// Only FIFOs using DMA are split into two ends;
// skip in shared memory case
if (int share_direction = 0; !requiresDMAs(createOp, share_direction))
if (int share_direction = 0; !requiresDMAs(createOp, share_direction)) {
if (createOp.getRepeatCount().has_value())
createOp->emitWarning("Repeat unavailable for tiles sharing memory; "
"ignoring `repeat_count`");
continue;
}

for (auto consumerTile : createOp.getConsumerTiles()) {
auto consumerTileOp = dyn_cast<TileOp>(consumerTile.getDefiningOp());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,4 @@ object_fifo_link([of1, of2], of0)
A full design example that uses these features is available in Section 2e: [05_join_L2](../../section-2e/05_join_L2/).

-----
[[Prev](../02_Broadcast/)] [[Up](..)] [[Next - Section 2c](../../section-2c/)]
[[Prev](../02_Broadcast/)] [[Up](..)] [[Next](../04_Repeat/)]
4 changes: 4 additions & 0 deletions programming_guide/section-2/section-2b/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ The Object FIFO primitive supports several data movement patterns. We will now d
* Distribute different pieces of the input data to multiple consumers
* Join outputs from different consumers into a bigger data tensor
</details>
<details><summary><a href="./04_Repeat/">Object FIFO Repeat Pattern</a></summary>

* Leverage Object FIFO Link to repeat data from the producer
</details>

-----
[[Prev - Section 2a](../section-2a/)] [[Up](..)] [[Next - Section 2c](../section-2c/)]
2 changes: 1 addition & 1 deletion programming_guide/section-2/section-2d/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def mlir_aie_design():
with mlir_mod_ctx() as ctx:

# Device declaration - aie2 device npu1_1col
@device(AIEDevice.xcvc1902)
@device(AIEDevice.npu1_1col)
def device_body():
data_ty = np.ndarray[(data_size,), np.dtype[np.int32]]

Expand Down
2 changes: 1 addition & 1 deletion programming_guide/section-2/section-2d/aie2_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def mlir_aie_design():
with mlir_mod_ctx() as ctx:

# Device declaration - aie2 device npu1_1col
@device(AIEDevice.xcvc1902)
@device(AIEDevice.npu1_1col)
def device_body():
tile_ty = np.ndarray[(tile_size,), np.dtype[np.int32]]
data_ty = np.ndarray[(data_size,), np.dtype[np.int32]]
Expand Down
4 changes: 2 additions & 2 deletions python/dialects/aie.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,9 +428,9 @@ def set_via_shared_mem(self, port):
int_num = IntegerAttr.get(T.i32(), num)
self.attributes["via_shared_mem"] = int_num

def set_memtile_repeat(self, num):
def set_repeat_count(self, num):
int_num = IntegerAttr.get(T.i32(), num)
self.attributes["memtile_repeat"] = int_num
self.attributes["repeat_count"] = int_num


# Create an aie objectFifo_link between input and output objectFifos.
Expand Down
23 changes: 23 additions & 0 deletions test/npu-xrt/objectfifo_repeat/compute_repeat/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<!---//===- README.md ---------------------------------------*- Markdown -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2024, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//-->

# <ins>Compute Repeat</ins>

This reference design can be run on a Ryzen™ AI NPU.

In the [design](./aie2.py), data is brought from external memory via the `ShimTile` to the `ComputeTile` and back by using an implicit copy via the compute tile's Data Movement Accelerator (DMA). Furthermore, the `ComputeTile` repeats the input data four times, so the output data consists of four instances of the input data.

The implicit copy is performed using the `object_fifo_link` operation, which specifies that input data arriving via `of_in` should be forwarded via `of_out` by leveraging the compute tile's DMA. This operation and its functionality are described in more depth in [Section-2b](../../../../programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md#object-fifo-link) of the programming guide.

The repeat count is specified as follows:
```python
of_out.set_repeat_count(repeat_count)
```
Specifically, the instruction above sets how many times in total the producer side of the `of_out` Object FIFO sends its data; a `repeat_count` of 1 means the data is sent once.
78 changes: 78 additions & 0 deletions test/npu-xrt/objectfifo_repeat/compute_repeat/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# test/npu-xrt/objectfifo_repeat/compute_repeat/aie2.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

# REQUIRES: ryzen_ai, valid_xchess_license
#
# RUN: %python %S/aie2.py 4096 > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe -x final.xclbin -i insts.txt -k MLIR_AIE -l 4096 | FileCheck %s
# CHECK: PASS!
import numpy as np
import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx

# Design parameters. Each default below can be overridden by an optional
# positional command-line argument: <N> <device> <column>.

# Repeat count applied to the output object FIFO.
repeat_count = 4

# Default target device; may be replaced by argv[2] below.
dev = AIEDevice.npu1_1col

# argv[1]: number of int32 elements in the input tensor.
N = int(sys.argv[1]) if len(sys.argv) > 1 else 4096
# The output tensor holds `repeat_count` times as many elements as the input.
data_out_size = N * repeat_count

# argv[2]: device name; only "npu" and "xcvc1902" are accepted.
if len(sys.argv) > 2:
    if sys.argv[2] not in ("npu", "xcvc1902"):
        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[2]))
    if sys.argv[2] == "xcvc1902":
        dev = AIEDevice.xcvc1902

# argv[3]: column index used for both tiles.
col = int(sys.argv[3]) if len(sys.argv) > 3 else 0


def compute_repeat():
    """Build the compute-tile repeat design and print the generated MLIR module.

    The design reads the module-level settings `dev`, `col`, `N`,
    `data_out_size` and `repeat_count`. Input data enters through the "in"
    object FIFO, is linked on the compute tile to the "out" object FIFO, and
    the "out" FIFO has its repeat count set to `repeat_count`.
    """
    with mlir_mod_ctx() as ctx:

        @device(dev)
        def device_body():
            # Tensor types: N int32 elements in, data_out_size int32 elements out.
            in_ty = np.ndarray[(N,), np.dtype[np.int32]]
            out_ty = np.ndarray[(data_out_size,), np.dtype[np.int32]]

            # Tiles used by the design (same column, rows 0 and 3).
            shim_tile = tile(col, 0)
            compute_tile = tile(col, 3)

            # Object FIFOs: shim -> compute ("in") and compute -> shim ("out").
            # Both carry objects of the input tensor type; the repeat count is
            # set on the outgoing FIFO before the two are linked.
            fifo_in = object_fifo("in", shim_tile, compute_tile, 1, in_ty)
            fifo_out = object_fifo("out", compute_tile, shim_tile, 1, in_ty)
            fifo_out.set_repeat_count(repeat_count)
            object_fifo_link(fifo_in, fifo_out)

            # Runtime data movement between host buffers and the AIE array.
            # B is declared in the sequence signature but is not used here.
            @runtime_sequence(in_ty, in_ty, out_ty)
            def sequence(A, B, C):
                in_sizes = [1, 1, 1, N]
                out_sizes = [1, 1, 1, data_out_size]
                npu_dma_memcpy_nd(metadata=fifo_in, bd_id=1, mem=A, sizes=in_sizes)
                npu_dma_memcpy_nd(
                    metadata=fifo_out, bd_id=0, mem=C, sizes=out_sizes
                )
                # The "out" transfer can only complete after the "in" transfer
                # has completed, so waiting on "out" alone is sufficient.
                dma_wait(fifo_out)

        print(ctx.module)


compute_repeat()
Loading
Loading