Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Enable runtime gpu_arch auto-select based on devices where kernels are executing for gemm_int4 tests; enable device-specific compilation using USE_XETLA (xe_lpg, xe_hpg, xe_hpc). #302

Open
wants to merge 1 commit into
base: xetla
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,30 @@ set(XETLA_OFFLINE_OPTIONS "${XETLA_OFFLINE_OPTIONS} -Xfinalizer -enableBCR")
# Optimization to reduce the tokens used for DPAS instruction.
set(XETLA_OFFLINE_OPTIONS "${XETLA_OFFLINE_OPTIONS} -Xfinalizer -DPASTokenReduction")

# USE_XETLA - Align to IPEX logic
if(USE_XETLA) # A quoted string always evaluates to false unless: The string's value is one of the true constants
string(REPLACE "," ";" USE_XETLA ${USE_XETLA})
message("The used archs are: ${USE_XETLA}")
elseif(NOT USE_XETLA) # if(<variable>): True if given a variable that is defined to a value that is not a false constant
message("No archs specified. Stopping CMake execution here.")
set(USE_XETLA "")
endif()

set(XETLA_AVAILABLE_ARCHS xe_hpc xe_hpg xe_lpg)
set(USE_XETLA_XE_LPG OFF)
set(USE_XETLA_XE_HPG OFF)
set(USE_XETLA_XE_HPC OFF)

foreach(used_arch IN LISTS USE_XETLA)
if (used_arch IN_LIST XETLA_AVAILABLE_ARCHS)
string(TOUPPER "${used_arch}" arch_upper)
set(USE_XETLA_${arch_upper} ON)
message(STATUS "XeTLA: Found arch from list: ${arch_upper}")
else()
message(FATAL_ERROR "Unexpected XeTLA architecture: ${used_arch}")
endif()
endforeach()

# AOT device
set(USE_AOT_DEVLIST "" CACHE STRING "Set device list for AOT build")
if (USE_AOT_DEVLIST)
Expand Down
38 changes: 26 additions & 12 deletions tests/integration/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,29 @@ function(add_integration_test target "host_cpp")
# target_link_libraries(${TARGET} PUBLIC MKL::MKL_SYCL)
endfunction()

# add_subdirectory(vector_add)
add_subdirectory(gemm)
add_subdirectory(gemv)
add_subdirectory(row_reduction)
add_subdirectory(layer_norm)
add_subdirectory(data_transformer)
add_subdirectory(default_config)
add_subdirectory(sg_dropout_op)
add_subdirectory(limitation)
add_subdirectory(softmax)
add_subdirectory(fmha)
add_subdirectory(col_major_shuf)
if (USE_XETLA_XE_LPG)
add_subdirectory(vector_add)
add_subdirectory(gemm)
# add_subdirectory(row_reduction)
# add_subdirectory(layer_norm)
# add_subdirectory(data_transformer)
# add_subdirectory(default_config)
# add_subdirectory(sg_dropout_op)
add_subdirectory(limitation)
# add_subdirectory(softmax)
add_subdirectory(fmha)
add_subdirectory(col_major_shuf)
else()
# add_subdirectory(vector_add)
add_subdirectory(gemm)
add_subdirectory(gemv)
add_subdirectory(row_reduction)
add_subdirectory(layer_norm)
add_subdirectory(data_transformer)
add_subdirectory(default_config)
add_subdirectory(sg_dropout_op)
add_subdirectory(limitation)
add_subdirectory(softmax)
add_subdirectory(fmha)
add_subdirectory(col_major_shuf)
endif()
26 changes: 15 additions & 11 deletions tests/integration/gemm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
include_directories(${CMAKE_SOURCE_DIR}/tests/integration/gemm)

add_subdirectory(bf16)
add_subdirectory(stream_k)
add_subdirectory(fp16)
add_subdirectory(fp32)
add_subdirectory(int8_quantization)
add_subdirectory(int8)
add_subdirectory(tf32)
add_subdirectory(int4_dequantization)
add_subdirectory(int4_dequantization_bias)
add_subdirectory(unaligned_bf16)
if (USE_XETLA_XE_LPG)
add_subdirectory(int4_dequantization)
add_subdirectory(int4_dequantization_bias)
else()
add_subdirectory(bf16)
add_subdirectory(stream_k)
add_subdirectory(fp16)
add_subdirectory(fp32)
add_subdirectory(int8_quantization)
add_subdirectory(int8)
add_subdirectory(tf32)
add_subdirectory(int4_dequantization)
add_subdirectory(int4_dequantization_bias)
add_subdirectory(unaligned_bf16)
endif()
104 changes: 97 additions & 7 deletions tests/integration/gemm/int4_dequantization/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,11 @@ class last {
using data_type_c = fp16;
};

template <class Test>
template <class Test, gpu_arch x, mma_engine y>
class KernalName {

};
template <class Test, gpu_arch x, mma_engine y>
void dequantize_gemm_run(uint32_t iter) {
using namespace gpu;
// Accept incoming parameters
Expand Down Expand Up @@ -238,16 +242,16 @@ void dequantize_gemm_run(uint32_t iter) {
data_type_scale,
data_type_zero_pt,
quant_info,
mma_engine::xmx,
gpu_arch::XeHpg>;
y,
x>;
using gemm_t = xetla::group::
gemm_t<compute_policy, tile_shape, mem_desc_a_t, mem_desc_b_t>;

using epilogue_t = xetla::group::epilogue_t<
xetla::group::epilogue_policy_default<gpu_arch::XeHpg>,
xetla::group::epilogue_policy_default<x>,
tile_shape,
mem_desc_c_t>;
using group_swizzle = xetla::kernel::group_swizzle_default<gpu_arch::XeHpg>;
using group_swizzle = xetla::kernel::group_swizzle_default<x>;
using gemm_op_t = xetla::kernel::gemm_universal_t<
gpu::xetla::kernel::dispatch_policy_int4_dequantize_kslicing<
group_swizzle,
Expand Down Expand Up @@ -366,7 +370,7 @@ void dequantize_gemm_run(uint32_t iter) {
for (uint32_t i = 0; i < iter; i++) {
prof.cpu_start();
auto e_esimd = queue.submit([&](handler& cgh) {
cgh.parallel_for<Test>(nd_range, [=](nd_item<3> item) KERNEL_MAIN {
cgh.parallel_for<KernalName<Test,x,y>>(nd_range, [=](nd_item<3> item) KERNEL_MAIN {
// allocate slm and nbarrier resource
slm_barrier_init<gemm_op_t>();
gemm_op_t gemm_op;
Expand Down Expand Up @@ -433,8 +437,94 @@ template <typename T>
class dequantize_gemm_test : public ::testing::Test {};
TYPED_TEST_SUITE_P(dequantize_gemm_test);

template <template<gpu_arch, mma_engine, class T> class F, class G>
class dispatch_arch_test
{
using T_RET = std::invoke_result_t<decltype(F<gpu_arch::XeHpc, mma_engine::xmx, G>::exec)>;

public:
template <typename... Args>
static T_RET exec(Args&&... args) {
// save default formatting
std::ios fmt_bak(nullptr);
fmt_bak.copyfmt(std::cout);

sycl::device device;
if (!device.has(aspect::ext_intel_device_id))
throw std::runtime_error("Can not get device ID");
auto deviceID = device.get_info<ext::intel::info::device::device_id>();
std::cout << "deviceID: 0x" << std::hex //
<< std::right << std::setfill('0') << deviceID << "\n";

// restore default formatting
std::cout.copyfmt(fmt_bak);
#if defined(SYCL_EXT_ONEAPI_DEVICE_ARCHITECTURE) && \
SYCL_EXT_ONEAPI_DEVICE_ARCHITECTURE
// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc#feature-test-macro
try {
namespace ENS = sycl::ext::oneapi::experimental;
auto deviceArch = device.get_info<ENS::info::device::architecture>();
switch (deviceArch) {
case ENS::architecture::intel_gpu_pvc:
return F<gpu_arch::XeHpc, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
return;
case ENS::architecture::intel_gpu_dg2_g10:
case ENS::architecture::intel_gpu_dg2_g11:
case ENS::architecture::intel_gpu_dg2_g12:
return F<gpu_arch::XeHpg, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
default:
break;
}
}
catch (...) {
std::cout << "Execption occurred! Please check one api versions.";
}
#endif
std::cout << "No matching architecture, checking device ID ...\n";
switch (deviceID) {
// MTL devices
case 0x7d55: // Intel® Arc ™ Graphics
std::cout << "MTL devices identified!" << std::endl;
return F<gpu_arch::XeLpg, mma_engine::fpu, G>::exec(std::forward<Args>(args)...);
// DG2 devices
case 0x56a0: // Intel® Arc ™ A770 Graphics
case 0x56a1: // Intel® Arc ™ A750 Graphics
case 0x56a2: // Intel® Arc ™ A580 Graphics
case 0x5690: // Intel® Arc ™ A770M Graphics
case 0x5691: // Intel® Arc ™ A730M Graphics
case 0x5692: // Intel® Arc ™ A550M Graphics
return F<gpu_arch::XeHpg, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
// PVC devices
case 0x0bda: //
return F<gpu_arch::XeHpc, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
default:
std::cout << "Unknown device ID \n";
break;
}

if (device.has(aspect::ext_intel_gpu_eu_simd_width))
throw std::runtime_error("Can not get eu_simd_width");
auto eu_simd_width =
device.get_info<ext::intel::info::device::gpu_eu_simd_width>();
if (eu_simd_width == 8) {
return F<gpu_arch::XeHpg, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
} else if (eu_simd_width == 16) {
return F<gpu_arch::XeHpc, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
} else {
throw std::runtime_error("Can not get device ID");
}
}
};

template <gpu_arch arch_tag, mma_engine engine_tag, typename T>
struct main_wrapper {
static constexpr auto exec = []() {
dequantize_gemm_run<T, arch_tag, engine_tag>(ITER);
};
};

TYPED_TEST_P(dequantize_gemm_test, esimd) {
dequantize_gemm_run<TypeParam>(ITER);
dispatch_arch_test<main_wrapper, TypeParam>::exec();
}

REGISTER_TYPED_TEST_SUITE_P(dequantize_gemm_test, esimd);
Expand Down
11 changes: 7 additions & 4 deletions tests/integration/gemm/int4_dequantization_bias/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ set(ProjectIdXe ${ProjectId})
string(PREPEND ProjectIdClient "gemm_client_")
string(PREPEND ProjectIdXe "gemm_xe_")

FILE(GLOB src_client main_client.cpp)
add_integration_test(${ProjectIdClient} ${src_client})
FILE(GLOB src_xe main_xe.cpp)
add_integration_test(${ProjectIdXe} ${src_xe})
if (USE_XETLA_XE_LPG)
FILE(GLOB src_client main_client.cpp)
add_integration_test(${ProjectIdClient} ${src_client})
else()
FILE(GLOB src_xe main_xe.cpp)
add_integration_test(${ProjectIdXe} ${src_xe})
endif()
12 changes: 8 additions & 4 deletions tests/integration/vector_add/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
add_subdirectory(tf32_1d)
add_subdirectory(bf16_2d)
add_subdirectory(int32_1d)
add_subdirectory(int32_2d)
if (USE_XETLA_XE_LPG)
add_subdirectory(int32_1d)
else()
add_subdirectory(tf32_1d)
add_subdirectory(bf16_2d)
add_subdirectory(int32_1d)
add_subdirectory(int32_2d)
endif()
67 changes: 45 additions & 22 deletions tests/unit/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,48 @@ function(add_unit_test target kernel_func_file test_host)
set_tests_properties(${TARGET} PROPERTIES LABELS "unit" TIMEOUT ${UNIT_TIMEOUT})
endfunction()

add_subdirectory(global_load_store)
add_subdirectory(global_atomic)
add_subdirectory(block_load_store)
add_subdirectory(tile_load_store)
add_subdirectory(tile_load_store_local)
add_subdirectory(internal_type_load_store_cvt)
add_subdirectory(local_load_store)
add_subdirectory(raw_send)
add_subdirectory(buff_compare)
add_subdirectory(tile_mma)
add_subdirectory(named_barrier)
add_subdirectory(tile_row_reduction)
add_subdirectory(add_c)
add_subdirectory(imul)
add_subdirectory(philox_rng)
add_subdirectory(exp_inv_sqrt_tanh)
add_subdirectory(reg_layout_conversion)
add_subdirectory(reg_reduce)
add_subdirectory(math_general)
add_subdirectory(epilogue_tile_op)
add_subdirectory(bit_mask_manipulation)

if (USE_XETLA_XE_LPG)
add_subdirectory(global_load_store)
add_subdirectory(global_atomic)
# add_subdirectory(block_load_store)
# add_subdirectory(tile_load_store)
# add_subdirectory(tile_load_store_local)
# add_subdirectory(internal_type_load_store_cvt)
add_subdirectory(local_load_store)
# add_subdirectory(raw_send)
add_subdirectory(buff_compare)
# add_subdirectory(tile_mma)
# add_subdirectory(named_barrier)
# add_subdirectory(tile_row_reduction)
add_subdirectory(add_c)
add_subdirectory(imul)
add_subdirectory(philox_rng)
# add_subdirectory(exp_inv_sqrt_tanh)
# add_subdirectory(reg_layout_conversion)
add_subdirectory(reg_reduce)
add_subdirectory(math_general)
# add_subdirectory(epilogue_tile_op)
# add_subdirectory(bit_mask_manipulation)
else()
add_subdirectory(global_load_store)
add_subdirectory(global_atomic)
add_subdirectory(block_load_store)
add_subdirectory(tile_load_store)
add_subdirectory(tile_load_store_local)
add_subdirectory(internal_type_load_store_cvt)
add_subdirectory(local_load_store)
add_subdirectory(raw_send)
add_subdirectory(buff_compare)
add_subdirectory(tile_mma)
add_subdirectory(named_barrier)
add_subdirectory(tile_row_reduction)
add_subdirectory(add_c)
add_subdirectory(imul)
add_subdirectory(philox_rng)
add_subdirectory(exp_inv_sqrt_tanh)
add_subdirectory(reg_layout_conversion)
add_subdirectory(reg_reduce)
add_subdirectory(math_general)
add_subdirectory(epilogue_tile_op)
add_subdirectory(bit_mask_manipulation)
endif()
Loading