Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge kernels in existing XCLBIN #418

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ jobs:
run: |
python3 -m venv .venv
source .venv/bin/activate
pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024061222+3ac9566-py3-none-manylinux_2_35_x86_64.whl
pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024061622+18c8815-py3-none-manylinux_2_35_x86_64.whl

pip install -r tests/matmul/requirements.txt

Expand Down
3 changes: 3 additions & 0 deletions build_tools/ci/cpu_comparison/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@ function run_test() {
run_test \
--test_file ${THIS_DIR}/test_files/matmul_int32.mlir

run_test \
--test_file ${THIS_DIR}/test_files/three_matmuls.mlir

run_test \
--name_prefix "matmul" \
--lhs_rhs_type "bf16" \
Expand Down
31 changes: 31 additions & 0 deletions build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// This test shows arbitrary matmuls that would have producer consumer relationships
// across different dispatches running on CI.

// These lines are strictly required by the script which generates input data:
//
// input 32x32xf32
// input 32x32xf32
// input 32x4xf32
// input 4x32xf32

!A_TYPE = tensor<32x32xf32>
!B_TYPE = tensor<32x4xf32>
!C_TYPE = tensor <4x32xf32>
!D_TYPE = tensor <4x4xf32>
func.func @two_mm(%lhs : !A_TYPE,
%rhs : !A_TYPE, %rhs_2 : !B_TYPE, %lhs_2 : !C_TYPE) -> !D_TYPE {
%empty = tensor.empty() : !A_TYPE
%empty_2 = tensor.empty() : !B_TYPE
%empty_3 = tensor.empty() : !D_TYPE
%cst = arith.constant 0.0 : f32
%fill = linalg.fill ins(%cst : f32) outs(%empty : !A_TYPE) -> !A_TYPE
%fill_2 = linalg.fill ins(%cst : f32) outs(%empty_2 : !B_TYPE) -> !B_TYPE
%fill_3 = linalg.fill ins(%cst : f32) outs(%empty_3 : !D_TYPE) -> !D_TYPE
%2 = linalg.matmul ins(%lhs, %rhs : !A_TYPE, !A_TYPE)
outs(%fill : !A_TYPE) -> !A_TYPE
%3 = linalg.matmul ins(%2, %rhs_2 : !A_TYPE, !B_TYPE)
outs(%fill_2 : !B_TYPE) -> !B_TYPE
%4 = linalg.matmul ins(%lhs_2, %3 : !C_TYPE, !B_TYPE)
outs(%fill_3 : !D_TYPE) -> !D_TYPE
return %4 : !D_TYPE
}
69 changes: 52 additions & 17 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,8 @@ LogicalResult AIETargetBackend::serializeExecutable(
SmallVector<uint32_t> xclbinIndices(ordinalCount);
SmallVector<uint32_t> asmInstrIndices(ordinalCount);

SmallVector<SmallString<128>> xclbinPaths;

for (size_t i = 0; i < entryPointNames.size(); i++) {
uint64_t ordinal = entryPointOrdinals.at(entryPointNames[i]);

Expand Down Expand Up @@ -300,18 +302,34 @@ LogicalResult AIETargetBackend::serializeExecutable(
llvm::sys::path::append(npuInstPath,
entryPointNamesFb[ordinal] + ".npu.txt");

SmallVector<StringRef> cmdArgs{aie2xclbin,
inputMlirPath,
"--peano",
options.peanoInstallDir,
"--xclbin-name",
xclbinPath,
"--npu-insts-name",
npuInstPath,
"--xclbin-kernel-name",
entryPointNamesFb[ordinal],
"--tmpdir",
entryPointWorkDir};
// Convert ordinal to hexadecimal string for xclbin kernel id.
std::stringstream ss;
ss << "0x" << std::hex << ordinal + 10;
std::string ordinalHex = ss.str();

SmallVector<StringRef> cmdArgs;
SmallVector<StringRef> cmdArgsBase{aie2xclbin,
inputMlirPath,
"--peano",
options.peanoInstallDir,
"--xclbin-name",
xclbinPath,
"--npu-insts-name",
npuInstPath,
"--xclbin-kernel-name",
entryPointNamesFb[ordinal],
"--tmpdir",
entryPointWorkDir,
"--xclbin-kernel-id",
ordinalHex};
cmdArgs = cmdArgsBase;
bool attemptingMerge = false;
if (i > 0) {
cmdArgs.push_back("--input-xclbin-name");
cmdArgs.push_back(xclbinPaths.back());
attemptingMerge = true;
}
xclbinPaths.push_back(xclbinPath);

auto addOpt = [&](StringRef arg, bool value) {
if (value) cmdArgs.push_back(arg);
Expand Down Expand Up @@ -350,11 +368,24 @@ LogicalResult AIETargetBackend::serializeExecutable(
{
SmallVector<StringRef> cmdEnvRefs{cmdEnv.begin(), cmdEnv.end()};
int result = llvm::sys::ExecuteAndWait(cmdArgs[0], cmdArgs, cmdEnvRefs);
if (result != 0)
if (result != 0 && attemptingMerge) {
// we failed to create xclbin but maybe we failed becuase we were trying
// to merge the kerenel in exisiting xclbin, try again to see if perhaps
// we have success if we dont try to merge.
attemptingMerge = false;
result =
llvm::sys::ExecuteAndWait(cmdArgsBase[0], cmdArgsBase, cmdEnvRefs);
xclbinPaths.push_back(xclbinPath);
}
if (result != 0) {
return moduleOp.emitOpError(
"Failed to produce an XCLBin with external tool.");
}
// delete the previous xclbin if we were able to merge as the new one now
// will have all the kernels from the previous one.
if (attemptingMerge) xclbinPaths.erase(xclbinPaths.end() - 2);
xclbinIndices[ordinal] = xclbinPaths.size() - 1;
}

std::ifstream instrFile(static_cast<std::string>(npuInstPath));
std::string line;
while (std::getline(instrFile, line)) {
Expand All @@ -369,17 +400,21 @@ LogicalResult AIETargetBackend::serializeExecutable(
asmInstrIndices[ordinal] = asmInstrRefs.size();
asmInstrRefs.push_back(
iree_amd_aie_hal_xrt_AsmInstDef_create(builder, npuInstrsVec));

}
// Write out the final xclbins to flatbuffer.
for (auto xclbinPath : xclbinPaths) {
llvm::outs() << "writing xclbin from path: " << xclbinPath << "\n";
std::string errorMessage;
xclbinIn = openInputFile(xclbinPath, &errorMessage);
if (!xclbinIn) {
moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage;
}
auto xclbinStringRef = builder.createString(xclbinIn->getBuffer());
xclbinIndices[ordinal] = xclbinRefs.size();
xclbinRefs.push_back(
iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef));
}
// Serialize the executable to flatbuffer format

// Serialize the executable to flatbuffer format.
auto entryPointsRef = builder.createStringVec(entryPointNamesFb);

iree_amd_aie_hal_xrt_ExecutableDef_entry_points_add(builder, entryPointsRef);
Expand Down
26 changes: 18 additions & 8 deletions runtime/src/iree-amd-aie/driver/xrt/native_executable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ iree_status_t iree_hal_xrt_native_executable_create(
iree_amd_aie_hal_xrt_XclbinDef_vec_t xclbins_vec =
iree_amd_aie_hal_xrt_ExecutableDef_xclbins_get(executable_def);

iree_host_size_t number_xclbin =
iree_amd_aie_hal_xrt_XclbinDef_vec_len(xclbins_vec);

iree_amd_aie_hal_xrt_AsmInstDef_vec_t asm_instrs_vec =
iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_get(executable_def);

Expand Down Expand Up @@ -163,17 +166,15 @@ iree_status_t iree_hal_xrt_native_executable_create(
&executable->resource);
executable->host_allocator = host_allocator;
executable->entry_point_count = entry_point_count;
for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count;
entry_ordinal++) {
const char* entry_name =
flatbuffers_string_vec_at(entry_points_vec, entry_ordinal);
uint32_t xclbin_index =
flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal);
// collect all the hardware contexts first as muliple entry points can map to
// the same context and this way we dont need to keep reloading them.
std::vector<xrt::hw_context> contexts;
for (iree_host_size_t xclbin_index = 0; xclbin_index < number_xclbin;
xclbin_index++) {
iree_amd_aie_hal_xrt_XclbinDef_table_t xclbin_def =
iree_amd_aie_hal_xrt_XclbinDef_vec_at(xclbins_vec, xclbin_index);
flatbuffers_string_t xclbin_fb =
iree_amd_aie_hal_xrt_XclbinDef_xclbin_get(xclbin_def);

// XRT API needs this vector and cant actually read a void*.
std::vector<char> xclbinVector(
xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb));
Expand All @@ -186,6 +187,14 @@ iree_status_t iree_hal_xrt_native_executable_create(
}
device.register_xclbin(xclbin);
xrt::hw_context context(device, xclbin.get_uuid());
contexts.push_back(context);
}
for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count;
entry_ordinal++) {
const char* entry_name =
flatbuffers_string_vec_at(entry_points_vec, entry_ordinal);
uint32_t xclbin_index =
flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal);
uint32_t asm_instr_index =
flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal);
iree_amd_aie_hal_xrt_AsmInstDef_table_t asminst_def =
Expand All @@ -196,7 +205,8 @@ iree_status_t iree_hal_xrt_native_executable_create(
std::unique_ptr<xrt::kernel> kernel;
std::unique_ptr<xrt::bo> instr;
try {
kernel = std::make_unique<xrt::kernel>(context, entry_name);
kernel =
std::make_unique<xrt::kernel>(contexts[xclbin_index], entry_name);
// XCL_BO_FLAGS_CACHEABLE is used to indicate that this is an instruction
// buffer that resides in instr_memory. This buffer is always passed as
// the second argument to the kernel and we can use the
Expand Down
Loading