[Convolution] Packing + objectFifo, initial support (#789)
This PR switches all numerical convolution tests to use the objectFifo
pipeline. With respect to the new tiling strategy:

1) A single **column** is currently used. Targeting multiple columns
results in ` error: 'aie.memtile_dma' op could not find and assign a
valid BD id`. This will be investigated as follow-up work:
#821

2) Compute and L2->L1 data movement are no longer interleaved, which makes
#619 low priority / obsolete.

3) L3->L2 and L2->L3 data movement still use padding, but L2->L1 and L1->L2 use packing (see the sketch after this list).

4) Channel-first convolution is completely unsupported; we expect high-level
transforms to convert to channel-last layout before reaching our backend.

5) Vectorization is not currently enabled due to issues with alignment.
See follow-up task #820.
This is functionally OK for now, as peano can scalarize code for all
data types.
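
As an illustration of point 3, the L2->L1 repacking of the convolution image tile can be written as a `tensor.pack` op. This is a minimal sketch only, not code from this PR; the function name, the use of `tensor.empty` for the destination, and the 32-input-channel shapes (taken from the example comment added to `KernelDispatch.cpp`) are illustrative assumptions:

```mlir
// Hypothetical sketch: pack the innermost (input channel) dimension of an
// L2 image tile into blocks of 8, and permute the outer dimensions so the
// channel blocks come third: 1x3x6x32 -> 1x3x4x6x8.
func.func @pack_image_tile(%src: tensor<1x3x6x32xbf16>) -> tensor<1x3x4x6x8xbf16> {
  // Destination tensor holding the packed layout.
  %dst = tensor.empty() : tensor<1x3x4x6x8xbf16>
  %packed = tensor.pack %src
      outer_dims_perm = [0, 1, 3, 2]
      inner_dims_pos = [3]
      inner_tiles = [8]
      into %dst : tensor<1x3x6x32xbf16> -> tensor<1x3x4x6x8xbf16>
  return %packed : tensor<1x3x4x6x8xbf16>
}
```

A similar packing, with inner tiles on both the input and output channel dimensions, is applied to the kernel tensor, e.g. 3x3x32x4 -> 3x3x4x1x8x4 in the comment added to `KernelDispatch.cpp`.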
newling authored Oct 9, 2024
1 parent 653f441 commit c84cca0
Showing 7 changed files with 145 additions and 109 deletions.
4 changes: 2 additions & 2 deletions build_tools/ci/cpu_comparison/run.py
@@ -641,7 +641,7 @@ def run(self, config):
config,
test_name,
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -661,7 +661,7 @@ def run(self, config):
config,
test_files_dir / f"{name}.mlir",
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -8,7 +8,7 @@ iree_lit_test_suite(
NAME
lit
SRCS
"conv2d_nhwc_air_e2e.mlir"
"conv2d_nhwc_objectfifo_e2e.mlir"
"matmul_elementwise_pack_peel_air_e2e.mlir"
"matmul_pack_peel_air_e2e.mlir"
"matmul_pack_peel_objectfifo.mlir"
@@ -1,4 +1,4 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=air --split-input-file | FileCheck %s
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=objectFifo --split-input-file | FileCheck %s

func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32x64xi32>) -> tensor<2x12x12x64xi32> {
%cst = arith.constant 0 : i32
@@ -20,7 +20,6 @@ namespace {
static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
RewriterBase &rewriter, linalg::LinalgOp op,
SmallVector<OpFoldResult> packedSizes) {
// Fail on mismatched number of pack sizes.
if (packedSizes.size() != op.getNumLoops()) {
op->emitOpError(
"requires number of packed sizes match the number of loops (")
@@ -29,12 +28,14 @@ static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
}

rewriter.setInsertionPoint(op);
FailureOr<linalg::PackResult> packResult =
FailureOr<linalg::PackResult> maybePackResult =
linalg::pack(rewriter, op, packedSizes);
if (failed(packResult)) {
if (failed(maybePackResult)) {
op->emitOpError("failed to pack the operation");
return failure();
}

linalg::PackResult packResult = maybePackResult.value();
return packResult;
}

@@ -60,7 +61,8 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Find the linalg op for packing, currently only consider contraction ops
linalg::LinalgOp linalgOp;
funcOp->walk([&](linalg::LinalgOp op) {
if (linalg::isaContractionOpInterface(op)) {
if (linalg::isaContractionOpInterface(op) ||
isa<linalg::ConvolutionOpInterface>(op.getOperation())) {
linalgOp = op;
return WalkResult::interrupt();
}
@@ -75,6 +77,7 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Step 1. Before packing the operation, we will prefetch the lowering and
// packing config.
auto config = getLoweringConfig<IREE::Codegen::LoweringConfigAttr>(linalgOp);

auto packingConfig = getPackingConfig(linalgOp);

if (!config || !packingConfig) {
@@ -87,6 +90,12 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Extract packing config from the `linalgOp`.
PackingConfigPackingLevelAttr packCfg =
packingConfig.getPackingConfigVals(packLevel);

if (!packCfg) {
funcOp->emitOpError("failed to get pack config for pack level ")
<< packLevel;
return signalPassFailure();
}
SmallVector<OpFoldResult> packedSizes =
getAsIndexOpFoldResult(context, packCfg.getPackedSizes());

@@ -472,54 +472,83 @@ static LogicalResult setRootConfigForPadPackPipeline(

static LogicalResult setRootConfigForConvDecomposePipeline(
mlir::FunctionOpInterface entryPointFn, linalg::LinalgOp linalgOp) {
MLIRContext *context = entryPointFn.getContext();

FailureOr<std::array<uint32_t, 3>> maybeInstructionSize =
getMatmulInstructionSize(linalgOp);
int64_t OW = 4;
int64_t OC = 4;
int64_t IC = 8;
if (succeeded(maybeInstructionSize)) {
auto instructionSize = maybeInstructionSize.value();
OW = instructionSize[0];
OC = instructionSize[1];
IC = instructionSize[2];
auto [m, n, k] = maybeInstructionSize.value();
OW = m;
OC = n;
IC = k;
}

SmallVector<int64_t> transposePackIndices{0, 1, 2};
SmallVector<bool> unpackEmpty{false, false, true};

// Convolution type specific vectors:
SmallVector<SmallVector<int64_t>> innerPerm;
SmallVector<SmallVector<int64_t>> outerPerm;
SmallVector<int64_t> tileSizeLevel0;
SmallVector<int64_t> tileSizeLevel1;
SmallVector<int64_t> tileSizeLevel2;
// Note: some of the tiling dimensions are hardcoded for now.
if (isa<linalg::Conv2DNhwcHwcfOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp)) {
// conv_2d_nhwc_hwcf tiling dims: [N, OH, OW, OC, KH, KW, IC].
tileSizeLevel0 = {0, 4, OW, OC, 0, 0, 0};
SmallVector<int64_t> packingSizes;

// [N, OH, OW, OC, KH, KW, IC].
if (isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfOp>(linalgOp)) {
// The goal is to pack the input image and kernel as follows, when moving
// from L2 to L1 (example where there are 32 input channels):
// Image: memref<1x3x6x32xbf16> -> memref<1x3x4x6x8xbf16>
// Kernel: memref<3x3x32x4xbf16> -> memref<3x3x4x1x8x4xbf16>
innerPerm = {{}, {{1, 0}}, {}};
outerPerm = {{0, 1, 3, 2}, {}, {0, 1, 2, 3}};
packingSizes = {0, 0, 0, OC, 0, 0, IC};
// Target one column of 4 cores, each core processing a different
// output image row. TODO(newling) use 4x4 array.
// https://github.com/nod-ai/iree-amd-aie/issues/821
tileSizeLevel0 = {1, 4, OW, OC, 0, 0, 0};
tileSizeLevel1 = {1, 1, OW, OC, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, IC};
} else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// conv_2d_nchw_fchw tiling dims: [N, OC, OH, OW, IC, KH, KW].
tileSizeLevel0 = {0, OC, 4, OW, 0, 0, 0};
tileSizeLevel1 = {1, OC, 1, OW, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, IC, 1, 1};
} else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// Notes:
// scf.for tiling of KH, KW, and (packed) IC dimensions:
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 1, 0, 0};
}

// [N, OC, OH, OW, IC, KH, KW]
else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// The matmul reduction dimension is the input channel (IC) dimension.
// For Conv2DNhwcHwcfOp, this dimension is already the inner-most dimension
// of the input image, and the penultimate dimension of the kernel --
// exactly what we want. For Conv2DNchwFchwOp, can the tensor dimensions be
// permuted in DMA to get them in the correct positions? For the image
// tensor, only if H*W is a nice power of 2 (DMA constraint). For kernels,
// it requires h*w is a nice power of 2 -- unlikely, we typically have
  // h=w=3. The dimension permutations will therefore often need to
  // be done on the core. We leave this for future work; the expectation for
// now is that models have been transformed at a high level to avoid
// channel-first convolutions.
return linalgOp.emitError(
"Only channel-last convolution supported currently.");
}

// [N, OH, OW, C, KW, HW]
else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// Notes
// =====
//
// An inherent property of depthwise convolutions is that they cannot be
// expressed in terms of matmuls, unlike the above (dense) conv-2ds. The
// tile sizes we choose below are therefore not constrained by the AIE
// matmul instructions.
// A property of depthwise convolution is that it can't be expressed in
// terms of matmul, unlike the above (dense) conv-2ds. The tile sizes we
// choose below are therefore not constrained by AIE matmul instructions.
//
// The logic is currently fragile, and there are no guardrails: there are
// no checks that the data tiles are not too large, or that the input
// dimensions are perfectly tiled by the hard-coded tile dimensions below.
// These will be done as a follow-up task.
//
//
// Below we target a 4x4 array of AIE cores.
auto getElementType = [](Value v) {
return cast<ShapedType>(v.getType()).getElementType();
};
const uint16_t OW_0 = 4;
const uint16_t OH_0 = 4;
const uint16_t OH_1 = 1;

auto operandType = getElementType(linalgOp->getOperand(0));
@@ -530,26 +559,49 @@ static LogicalResult setRootConfigForConvDecomposePipeline(
OC_0 = maybeMacNumElements.value();
}
// If the operand type has fewer than 32-bits, we really should be able to
// get a mac-width for it Bail because we didn't, and there's probably just
// something missing in the table.
// get a mac-width for it. Bail because we didn't, there's probably just
// something missing in a table.
else if (operandType.getIntOrFloatBitWidth() < 32) {
return linalgOp.emitError(
"has an operand type with fewer than 32-bits, but no mac-width "
"could be determined.");
}

const uint16_t OC_1 = OC_0 / 4;

// depthwise_conv2d_nhwc_hwc tiling dims:
// [N, OH, OW, OC, KH,KW]
tileSizeLevel0 = {1, OH_0, OW_0, OC_0, 0, 0};
packingSizes = {0, 0, 0, OC_1, 0, 0};
innerPerm = {{}, {}, {}};
outerPerm = {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 2, 3}};
// Target one column of 4 cores, each core processing a different
// output image row. TODO(newling) use 4x4 array.
// https://github.com/nod-ai/iree-amd-aie/issues/821
tileSizeLevel0 = {1, 4 * OH_1, OW_0, OC_1, 0, 0};
tileSizeLevel1 = {1, OH_1, OW_0, OC_1, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1};
} else {
assert(false && "Support must be added for this convolution op");
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 0};
}

else {
return linalgOp.emitError(
"unrecognised convolution op, cannot set packing config. ");
}

assert(!innerPerm.empty() && !outerPerm.empty() && !packingSizes.empty() &&
!tileSizeLevel0.empty() && !tileSizeLevel1.empty() &&
"not all vectors for initializing config are non-empty");

auto packingConfigLevel1Attr = getPackingConfigPackingLevelAttr(
context, packingSizes, transposePackIndices, unpackEmpty, innerPerm,
outerPerm);
SmallVector<PackingConfigPackingLevelAttr> packingConfigLevelsVal{
packingConfigLevel1Attr};

auto packingConfigLevels =
PackingConfigPackingLevelsAttr::get(context, packingConfigLevelsVal);
auto config = PackingConfigAttr::get(context, packingConfigLevels);
setPackingConfig(linalgOp, config);

TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1,
tileSizeLevel2};

return setOpConfigAndEntryPointFnTranslation(
entryPointFn, linalgOp, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::Custom);
60 changes: 25 additions & 35 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -408,16 +408,20 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
bool enableVectorizationPasses,
TilePassPipeline useTilePipeline) {
auto addCleanups = [&]() {
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
};

// First level tiling using scf.forall
{
AMDAIETileAndFuseOptions tileFuseOptions;
tileFuseOptions.tilingLevel = 0;
tileFuseOptions.useSCFFor = false;
funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
addCleanups();
}
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

// Pad the linalg operation
{
@@ -441,67 +445,50 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
tileFuseOptions.tilingLevel = 1;
tileFuseOptions.useSCFFor = false;
funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
addCleanups();
}
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

// Fuse fill op into the inner forall loop
funcPassManager.addPass(createAMDAIEFuseFillIntoForallPass());
funcPassManager.addPass(createCanonicalizerPass());

// Pad the linalg operation
// Pack the linalg operation
{
AMDAIEPadOptions padOptions;
padOptions.paddingLevel = 1;
funcPassManager.addPass(createAMDAIEPadPass(padOptions));
AMDAIEPackAndTransposeOptions packOptions;
packOptions.packLevel = 0;
funcPassManager.addPass(createAMDAIEPackAndTransposePass(packOptions));
}

// Only promote the result to local memory
// Promote the inputs and results to local memory
{
AMDAIEBufferizeToAllocationOptions bufferizeOptions;
bufferizeOptions.memorySpace = 2;
bufferizeOptions.bufferizeOperand = BufferizeOperand::Output;
bufferizeOptions.bufferizeOperand = BufferizeOperand::InputOutput;
funcPassManager.addPass(
createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
addCleanups();
}

// Tile the reduction dimension using scf.for
{
AMDAIETileAndFuseOptions tileFuseOptions;
tileFuseOptions.tilingLevel = 2;
tileFuseOptions.useSCFFor = true;
funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
}
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

// Pad the linalg operation
{
AMDAIEPadOptions padOptions;
padOptions.paddingLevel = 2;
funcPassManager.addPass(createAMDAIEPadPass(padOptions));
}

// Promote the inputs to local memory
{
AMDAIEBufferizeToAllocationOptions bufferizeOptions;
bufferizeOptions.memorySpace = 2;
bufferizeOptions.bufferizeOperand = BufferizeOperand::Input;
funcPassManager.addPass(
createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
addCleanups();
}

// Decompose Conv2d ops to Conv1d ops
funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass());
LinalgFoldUnitExtentDimsPassOptions opts;
opts.useRankReducingSlices = true;
funcPassManager.addPass(mlir::createLinalgFoldUnitExtentDimsPass(opts));

// Vectorization passes
// FIXME(newling) https://github.com/nod-ai/iree-amd-aie/issues/820
enableVectorizationPasses = false;
appendVectorizationToPipeline(funcPassManager, enableVectorizationPasses);
funcPassManager.addPass(createCanonicalizerPass());

// Comprehensive bufferization
addAMDAIEBufferizePasses(funcPassManager, useTilePipeline);
funcPassManager.addPass(createHoistStaticallyBoundAllocationsPass());
}

void buildAMDAIETransformPassPipeline(
@@ -557,6 +544,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
bool enablePacketFlow) {
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
passManager.addPass(memref::createFoldMemRefAliasOpsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createAMDAIEConvertToDmaPass());

passManager.addPass(createAMDAIENormalizeLoopBoundsPass());
@@ -582,6 +570,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
passManager.addPass(createAMDAIEAssignLogicalObjectFifoDepthPass());
passManager.addPass(createAMDAIEAccessToAcquireReleasePass());
passManager.addPass(createAMDAIENoneAccessToTemporaryBufferPass());

passManager.addPass(
createAMDAIEAssignConnectionTypesPass({enablePacketFlow}));
passManager.addPass(createCSEPass());
@@ -612,6 +601,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
passManager.addPass(createCanonicalizerPass());

passManager.addPass(createAMDAIEObjFifoBufferizationPass());
passManager.addPass(createAMDAIETemporaryAllocBufferizationPass());
passManager.addPass(createAMDAIEConnectionToFlowPass());
passManager.addPass(createAMDAIEAssignPacketIdsPass());
