[Vectorization][ObjectFifo] Enable larger Matmul + Truncf #856

Open · wants to merge 23 commits into base: main

Commits (23)
17f6bcc
[Insert-Loops-For-Vec] Update insert-loops-for-vectorization pass
Abhishek-Varma Oct 22, 2024
d53d30e
Disable AIR tests
Abhishek-Varma Oct 28, 2024
ab1c09c
[WIP] Enable vectorization after bufferization (insert-cores)
Abhishek-Varma Oct 14, 2024
5a46ed2
Fix conv pipeline failure + temporarily remove insert_loops test
Abhishek-Varma Oct 14, 2024
1a02a64
Reenable insert-loops test
Abhishek-Varma Oct 14, 2024
248465a
Fix more AIR e2e
Abhishek-Varma Oct 15, 2024
4f6698f
Add lit test for subview->vector.transfer_read/transfer_write
Abhishek-Varma Oct 23, 2024
95c937f
[WIP] Create a function outlining pass
Abhishek-Varma Oct 9, 2024
95143b9
Subsume the compute op and form the function body
Abhishek-Varma Oct 29, 2024
b98d502
After insert-cores fix
Abhishek-Varma Oct 15, 2024
a036c81
Update lower-to-aie for outlined function
Abhishek-Varma Oct 16, 2024
f9a2f6b
Add lit test
Abhishek-Varma Oct 23, 2024
f581744
Add e2e test for larger Matmul + truncf
Abhishek-Varma Oct 23, 2024
9a56fa7
Disable Outlining for Conv
Abhishek-Varma Oct 28, 2024
a342fdc
Fix canonicalization for trivial access read/write subview->vec
Abhishek-Varma Oct 28, 2024
f0bcd45
Fix e2e test
Abhishek-Varma Oct 28, 2024
ee1e49b
Disable few more AIR related tests from run_matmul_test.sh
Abhishek-Varma Oct 28, 2024
cf1d483
Nit change
Abhishek-Varma Oct 28, 2024
89fc4ca
Review comments for Canonicalization pattern
Abhishek-Varma Oct 29, 2024
5397238
Review comments for FuncOutlining
Abhishek-Varma Oct 29, 2024
1e7f4fe
Review comments InsertLoops
Abhishek-Varma Oct 29, 2024
3a2a7a7
Review comment of disabling AIR test
Abhishek-Varma Oct 29, 2024
c6de3ea
Add e2e lit test of Matmul + Truncf
Abhishek-Varma Oct 29, 2024
128 changes: 78 additions & 50 deletions build_tools/ci/cpu_comparison/run.py
@@ -526,7 +526,7 @@ def aie_vs_llvm_cpu(
test_file,
use_ukernel=False,
tile_pipeline="pad-pack",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
function_name=None,
seed=1,
rtol=1e-6,
@@ -675,58 +675,61 @@ def run(self, config):
test_files_dir = config.file_dir / "test_files"
output_dir = config.output_dir

# AIR tests are currently disabled due to the change of infra to:
# bufferization -> insert-cores -> vectorization (@erwei-xilinx)
# Test(s) of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN
test_name = output_dir / "test_from_template_full_bias.mlir"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir"
generate_matmul_test(test_name, template_name, 128, 128, 256, "i32", "i32")
aie_vs_llvm_cpu(
config,
test_name,
tile_pipeline="pack-peel",
lower_to_aie_pipeline="air",
rtol=0,
atol=0,
)

if config.xdna_datetime and config.xdna_datetime < 20240801:
for name in [
"two_matmul_switching",
"matmul_f32_8_8_4",
"matmul_f32_8_4_8",
]:
aie_vs_llvm_cpu(config, test_files_dir / f"{name}.mlir")

aie_vs_llvm_cpu(
config,
test_files_dir / "three_matmuls.mlir",
function_name="three_$mm$",
)
# test_name = output_dir / "test_from_template_full_bias.mlir"
# template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir"
# generate_matmul_test(test_name, template_name, 128, 128, 256, "i32", "i32")
# aie_vs_llvm_cpu(
# config,
# test_name,
# tile_pipeline="pack-peel",
# lower_to_aie_pipeline="air",
# rtol=0,
# atol=0,
# )

# if config.xdna_datetime and config.xdna_datetime < 20240801:
# for name in [
# "two_matmul_switching",
# "matmul_f32_8_8_4",
# "matmul_f32_8_4_8",
# ]:
# aie_vs_llvm_cpu(config, test_files_dir / f"{name}.mlir", lower_to_aie_pipeline="air")

# aie_vs_llvm_cpu(
# config,
# test_files_dir / "three_matmuls.mlir",
# lower_to_aie_pipeline="air",
# function_name="three_$mm$",
# )

# Test(s) of the form matmul(A,B) where A:MxK, B:KxN
test_name = output_dir / "test_from_template.mlir"
template_name = matmul_template_dir / "matmul_MxK_KxN.mlir"
generate_matmul_test(test_name, template_name, 32, 32, 64, "bf16", "f32")
aie_vs_llvm_cpu(config, test_name)
# test_name = output_dir / "test_from_template.mlir"
# template_name = matmul_template_dir / "matmul_MxK_KxN.mlir"
# generate_matmul_test(test_name, template_name, 32, 32, 64, "bf16", "f32")
# aie_vs_llvm_cpu(config, test_name, lower_to_aie_pipeline="air")

# Test(s) of the form matmul(A,B) + C where A:MxK, B:KxN, C:N
test_name = output_dir / "test_from_template_bias_N.mlir"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_N.mlir"
generate_matmul_test(test_name, template_name, 1024, 1024, 512, "bf16", "f32")
if config.vitis_dir:
aie_vs_llvm_cpu(
config,
test_name,
tile_pipeline="pack-peel",
lower_to_aie_pipeline="air",
use_ukernel=True,
)
aie_vs_llvm_cpu(
config,
test_name,
tile_pipeline="pack-peel",
lower_to_aie_pipeline="air",
use_ukernel=False,
)
# test_name = output_dir / "test_from_template_bias_N.mlir"
# template_name = matmul_template_dir / "matmul_bias_MxK_KxN_N.mlir"
# generate_matmul_test(test_name, template_name, 1024, 1024, 512, "bf16", "f32")
# if config.vitis_dir:
# aie_vs_llvm_cpu(
# config,
# test_name,
# tile_pipeline="pack-peel",
# lower_to_aie_pipeline="air",
# use_ukernel=True,
# )
# aie_vs_llvm_cpu(
# config,
# test_name,
# tile_pipeline="pack-peel",
# lower_to_aie_pipeline="air",
# use_ukernel=False,
# )

# Test(s) of the form batch_matmul(A,B) where A:BxMxK, B:BxKxN
template_name = matmul_template_dir / "batch_matmul_BxMxK_BxKxN.mlir"
@@ -783,6 +786,31 @@ def run(self, config):
output_type=get_output_type(test_name),
)

# Large shape Matmul + Truncf
Contributor:
Well, I don't think it's a large shape.

Related question: do even larger shapes work as well? You don't need to add more CI tests here, but I'm just curious what the largest shape is that can work.

Contributor Author:
So, larger shapes like 1024x1024x1024 and 512x2048x4096 (MxNxK) all work.

But shapes like 1536x1536x2048 and 4096x4096x2048 (MxNxK) suffer from a PM issue.
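For reference, a minimal sketch (not part of this PR) of how one of the working shapes could be exercised locally with the helpers already used in run.py. It assumes the surrounding run() scope (config, output_dir, matmul_template_dir), the new objectFifo default pipeline, and a hypothetical output file name; the arguments after the template are M, N, K and the element types, as in the 128x128x256 test below.

test_name = output_dir / "test_matmul_1024x1024x1024.mlir"  # hypothetical name
template_name = matmul_template_dir / "matmul_MxK_KxN.mlir"
generate_matmul_test(test_name, template_name, 1024, 1024, 1024, "bf16", "f32")
# Tolerances may need loosening for bf16 inputs at K = 1024.
aie_vs_llvm_cpu(config, test_name, tile_pipeline="pack-peel")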

Contributor:
Okay, then it may be good to remove the "large shape" wording from the comment above.

generate_matmul_test(test_name, template_name, 128, 128, 256, "bf16", "f32")
identity_mat = np.eye(128, dtype=np.float32)
lhs_ones = np.ones(128 * 256, dtype=np.float32).reshape([128, 256])
rhs_ones = np.ones(256 * 128, dtype=np.float32).reshape([256, 128])
out_ones = np.ones(128 * 128, dtype=np.float32).reshape([128, 128])
lhs = lhs_ones * 2
rhs = rhs_ones * 3
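# Note: lhs is all 2s (128x256) and rhs is all 3s (256x128), so every element
# of matmul(lhs, rhs) is 2 * 3 * 256 = 1536, the baseline used below.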
input_args = generate_inputs(test_name, output_dir, 1, {1: lhs, 2: rhs})
aie_vs_baseline(
config,
test_name,
input_args,
out_ones * 1536, # expected output
use_ukernel=False,
tile_pipeline="pack-peel",
lower_to_aie_pipeline="objectFifo",
function_name=None,
seed=1,
rtol=0,
atol=0,
n_repeats=1,
output_type=get_output_type(test_name),
)


class SmokeSet(TestSet):
def __init__(self):
@@ -793,8 +821,8 @@ def run(self, config):
output_dir = config.output_dir

# The most basic test, direct from .mlir file using all defaults
test_files_dir = file_dir / "test_files"
aie_vs_llvm_cpu(config, test_files_dir / "matmul_int32.mlir")
# test_files_dir = file_dir / "test_files"
# aie_vs_llvm_cpu(config, test_files_dir / "matmul_int32.mlir")
Collaborator:
Maybe make objectFifo the default?

Contributor Author:
I've made objectFifo the default and changed these tests to use the air pipeline explicitly, albeit commented out.


# Using objectFifo pipeline
test_name = output_dir / "test_from_template.mlir"
30 changes: 16 additions & 14 deletions build_tools/ci/run_matmul_test.sh
@@ -593,20 +593,22 @@ run_matmul_test \
--acc_type "i32" \
--m "64" --n "64" --k "128"

run_matmul_test \
--name_prefix "packPeel_bf16" \
--tile_pipeline "pack-peel" \
--lhs_rhs_type "bf16" \
--acc_type "f32" \
--m "512" --n "512" --k "512"

run_matmul_test \
--name_prefix "packPeel_t_bf16" \
--tile_pipeline "pack-peel" \
--lhs_rhs_type "bf16" \
--acc_type "f32" \
--m "128" --n "256" --k "512" \
--do_transpose_rhs "1"
# AIR tests are currently disabled due to the change of infra to:
# bufferization -> insert-cores -> vectorization (@erwei-xilinx)
# run_matmul_test \
Collaborator:
Add comments on why this is disabled.

Contributor:
I think all the AIR tests fail because @Abhishek-Varma changed the order of bufferization and vectorization?

FYI @erwei-xilinx may want to take a look at the changes and see if he can do anything to make the pack-peel pipeline work for AIR.

Collaborator:
Yeah, I am asking to add a comment in the code so that someone reading it a few months from now knows why this was disabled.

# --name_prefix "packPeel_bf16" \
# --tile_pipeline "pack-peel" \
# --lhs_rhs_type "bf16" \
# --acc_type "f32" \
# --m "512" --n "512" --k "512"

# run_matmul_test \
# --name_prefix "packPeel_t_bf16" \
# --tile_pipeline "pack-peel" \
# --lhs_rhs_type "bf16" \
# --acc_type "f32" \
# --m "128" --n "256" --k "512" \
# --do_transpose_rhs "1"

###################################################################
# ObjectFifo Matmul tests
@@ -26,6 +26,7 @@
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
@@ -39,6 +40,92 @@ namespace mlir::iree_compiler::aievec {

using namespace mlir;

/// Utility to check if the indices provided are all 0.
static LogicalResult isAllZeroOffsetAccess(mlir::OperandRange indices) {
if (!llvm::all_of(indices, [](Value val) {
IntegerAttr attr;
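// An index that is not a compile-time constant cannot be proven to be zero.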
if (!matchPattern(val, m_Constant(&attr))) return false;
return attr.getInt() == 0;
})) {
return failure();
}
return success();
}

/// Utility to fetch the indices of a memref.subview op to be used by a new
/// vector transfer_read/transfer_write op with a trivial access pattern.
static SmallVector<Value> fetchNewIndices(PatternRewriter &rewriter,
Location loc,
memref::SubViewOp subViewOp) {
SmallVector<Value> newIndices;
for (OpFoldResult offset : subViewOp.getMixedOffsets()) {
Value indexVal;
if (auto attr = dyn_cast<Attribute>(offset)) {
indexVal = rewriter.create<arith::ConstantIndexOp>(
loc, cast<IntegerAttr>(attr).getInt());
} else {
indexVal = cast<Value>(offset);
}
newIndices.push_back(indexVal);
}
return newIndices;
}

/// A rewrite pattern to canonicalize the following:
/// INPUT:
/// %b = memref.subview %a [offset0, offset1, ...]
/// %c = vector.transfer_read %b[0, 0, ...]
/// OUTPUT:
/// %c = vector.transfer_read %a[offset0, offset1, ...]
///
/// This is needed to enable the other staged canonicalizations in this pass.
struct CanonicalizeTrivialReadAccessSubviewOpPattern
: public OpRewritePattern<vector::TransferReadOp> {
using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
PatternRewriter &rewriter) const override {
auto subViewOp = dyn_cast_if_present<memref::SubViewOp>(
readOp.getSource().getDefiningOp());
if (!subViewOp) return failure();
if (failed(isAllZeroOffsetAccess(readOp.getIndices()))) return failure();
SmallVector<Value> newIndices =
fetchNewIndices(rewriter, readOp.getLoc(), subViewOp);
rewriter.replaceOpWithNewOp<vector::TransferReadOp>(
readOp, readOp.getType(), subViewOp.getSource(), newIndices,
readOp.getPadding(), readOp.getInBoundsValues());
return success();
}
};

/// A rewrite pattern to canonicalize the following:
/// INPUT:
/// %b = memref.subview %a [offset0, offset1, ...]
/// vector.transfer_write %val, %b[0, 0, ...]
/// OUTPUT:
/// vector.transfer_write %val, %a[offset0, offset1, ...]
///
/// This is needed to enable the other staged canonicalizations in this pass.
struct CanonicalizeTrivialWriteAccessSubviewOpPattern
: public OpRewritePattern<vector::TransferWriteOp> {
using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
PatternRewriter &rewriter) const override {
auto subViewOp = dyn_cast_if_present<memref::SubViewOp>(
writeOp.getSource().getDefiningOp());
if (!subViewOp) return failure();
if (failed(isAllZeroOffsetAccess(writeOp.getIndices()))) return failure();
SmallVector<Value> newIndices =
fetchNewIndices(rewriter, writeOp.getLoc(), subViewOp);
rewriter.create<vector::TransferWriteOp>(
writeOp.getLoc(), writeOp.getVector(), subViewOp.getSource(),
newIndices, writeOp.getInBoundsValues());
rewriter.eraseOp(writeOp);
return success();
}
};

static bool isGemmBTransposedContractionOp(vector::ContractionOp op) {
if (op.getKind() != vector::CombiningKind::ADD) return false;

@@ -628,6 +715,12 @@ struct CanonicalizeVectorForAIEVecPass
auto op = getOperation();
MLIRContext *context = &getContext();

{
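// First fold trivial all-zero-index transfer_read/transfer_write ops on a
// memref.subview into accesses on the subview's source; the staged
// canonicalizations below rely on these direct accesses.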
RewritePatternSet patterns(context);
patterns.add<CanonicalizeTrivialReadAccessSubviewOpPattern,
CanonicalizeTrivialWriteAccessSubviewOpPattern>(context);
(void)applyPatternsAndFoldGreedily(op, std::move(patterns));
}
{
// These must run before 'populateVectorBroadcastLoweringPatterns'
// so that broadcasts can be matched before conversion to insert.
@@ -167,3 +167,42 @@ func.func @arith_truncf(%inp: vector<2x3xf32>) -> vector<2x3xbf16> {
%0 = arith.truncf %inp : vector<2x3xf32> to vector<2x3xbf16>
return %0 : vector<2x3xbf16>
}

// -----

// CHECK-LABEL: @trivial_read_access
// CHECK-SAME: (%[[ARG0:.*]]: memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>>)
// CHECK-NOT: memref.subview
// CHECK: %[[COLLAPSE_SHAPE:.*]] = memref.collapse_shape %[[ARG0]]
// CHECK-SAME: into memref<1024xbf16, strided<[1]>>
// CHECK: %[[READ:.*]] = vector.transfer_read %[[COLLAPSE_SHAPE]]
// CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[READ]]
// CHECK-SAME: vector<32xbf16> to vector<1x1x4x8xbf16>
// CHECK: return %[[SHAPE_CAST]]
func.func @trivial_read_access(%arg0: memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>>) -> vector<1x1x4x8xbf16> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%subview = memref.subview %arg0[2, 3, 0, 0] [1, 1, 4, 8] [1, 1, 1, 1] : memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>> to memref<1x1x4x8xbf16, strided<[256, 32, 8, 1], offset: 608>>
%read = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x4x8xbf16, strided<[256, 32, 8, 1], offset: 608>>, vector<1x1x4x8xbf16>
return %read : vector<1x1x4x8xbf16>
}

// -----

// CHECK-LABEL: @trivial_write_access
// CHECK-SAME: (%[[ARG0:.*]]: memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>>,
// CHECK-SAME: %[[ARG1:.*]]: vector<1x1x4x4xf32>)
// CHECK-NOT: memref.subview
// CHECK: %[[COLLAPSE_SHAPE:.*]] = memref.collapse_shape %[[ARG0]]
// CHECK-SAME: : memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>> into memref<1024xf32, strided<[1]>>
// CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[ARG1]]
// CHECK-SAME: : vector<1x1x4x4xf32> to vector<16xf32>
// CHECK: vector.transfer_write %[[SHAPE_CAST]], %[[COLLAPSE_SHAPE]]
// CHECK: return
func.func @trivial_write_access(%arg0: memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>>, %arg1: vector<1x1x4x4xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%subview = memref.subview %arg0[2, 3, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>> to memref<1x1x4x4xf32, strided<[128, 16, 4, 1], offset: 304>>
vector.transfer_write %arg1, %subview[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x4x4xf32>, memref<1x1x4x4xf32, strided<[128, 16, 4, 1], offset: 304>>
return
}
@@ -6,18 +6,32 @@

iree_lit_test_suite(
NAME
lit
lit_objectFifo
SRCS
"conv2d_nhwc_objectfifo_e2e.mlir"
"matmul_elementwise_pack_peel_air_e2e.mlir"
"matmul_pack_peel_air_e2e.mlir"
"matmul_pack_peel_objectfifo.mlir"
"matmul_pack_peel_objectfifo_e2e.mlir"
"matmul_pad_pack_air_e2e.mlir"
"matmul_elementwise_pack_peel_objectfifo_e2e.mlir"
"xdna_oplib_plugin.mlir"
TOOLS
${IREE_LLD_TARGET}
FileCheck
iree-opt
iree-compile
)

# AIR tests are currently disabled due to the change of infra to:
# bufferization -> insert-cores -> vectorization (@erwei-xilinx)
# iree_lit_test_suite(
# NAME
# lit_air
# SRCS
# "matmul_elementwise_pack_peel_air_e2e.mlir"
# "matmul_pack_peel_air_e2e.mlir"
# "matmul_pad_pack_air_e2e.mlir"
# TOOLS
# ${IREE_LLD_TARGET}
# FileCheck
# iree-opt
# iree-compile
# )