From bb9194655d67e9a3a7e1df2ef34325a00698f853 Mon Sep 17 00:00:00 2001 From: Nithin Meganathan <18070964+nithinsubbiah@users.noreply.github.com> Date: Mon, 11 Dec 2023 14:32:50 -0800 Subject: [PATCH] [HIP] Adds buffer and allocator implementations (#15791) This patch adds buffer, allocator, and memory pool implementation for the new HIP backend. Progress towards https://github.com/openxla/iree/issues/15790 --- experimental/hip/CMakeLists.txt | 8 +- experimental/hip/api.h | 23 + experimental/hip/dynamic_symbol_tables.h | 18 +- experimental/hip/hip_allocator.c | 602 +++++++++++++++++++++++ experimental/hip/hip_allocator.h | 32 ++ experimental/hip/hip_buffer.c | 171 +++++++ experimental/hip/hip_buffer.h | 68 +++ experimental/hip/memory_pools.c | 304 ++++++++++++ experimental/hip/memory_pools.h | 84 ++++ 9 files changed, 1304 insertions(+), 6 deletions(-) create mode 100644 experimental/hip/hip_allocator.c create mode 100644 experimental/hip/hip_allocator.h create mode 100644 experimental/hip/hip_buffer.c create mode 100644 experimental/hip/hip_buffer.h create mode 100644 experimental/hip/memory_pools.c create mode 100644 experimental/hip/memory_pools.h diff --git a/experimental/hip/CMakeLists.txt b/experimental/hip/CMakeLists.txt index 5fe19142865a..c9a86d55978f 100644 --- a/experimental/hip/CMakeLists.txt +++ b/experimental/hip/CMakeLists.txt @@ -25,7 +25,13 @@ iree_cc_library( "api.h" SRCS "api.h" + "hip_allocator.c" + "hip_allocator.h" + "hip_buffer.c" + "hip_buffer.h" "hip_driver.c" + "memory_pools.c" + "memory_pools.h" INCLUDES "${HIP_API_HEADERS_ROOT}" DEPS @@ -47,8 +53,8 @@ iree_cc_library( TEXTUAL_HDRS "dynamic_symbol_tables.h" SRCS - "hip_headers.h" "dynamic_symbols.c" + "hip_headers.h" "status_util.c" INCLUDES "${HIP_API_HEADERS_ROOT}" diff --git a/experimental/hip/api.h b/experimental/hip/api.h index 1e838598f46c..f3d91726b632 100644 --- a/experimental/hip/api.h +++ b/experimental/hip/api.h @@ -16,6 +16,29 @@ extern "C" { #endif // __cplusplus +//===----------------------------------------------------------------------===// +// iree_hal_hip_device_t +//===----------------------------------------------------------------------===// + +// Parameters defining a hipMemPool_t. +typedef struct iree_hal_hip_memory_pool_params_t { + // Minimum number of bytes to keep in the pool when trimming with + // iree_hal_device_trim. + uint64_t minimum_capacity; + // Soft maximum number of bytes to keep in the pool. + // When more than this is allocated the extra will be freed at the next + // device synchronization in order to remain under the threshold. + uint64_t release_threshold; +} iree_hal_hip_memory_pool_params_t; + +// Parameters for each hipMemPool_t used for queue-ordered allocations. +typedef struct iree_hal_hip_memory_pooling_params_t { + // Used exclusively for DEVICE_LOCAL allocations. + iree_hal_hip_memory_pool_params_t device_local; + // Used for any host-visible/host-local memory types. + iree_hal_hip_memory_pool_params_t other; +} iree_hal_hip_memory_pooling_params_t; + //===----------------------------------------------------------------------===// // iree_hal_hip_driver_t //===----------------------------------------------------------------------===// diff --git a/experimental/hip/dynamic_symbol_tables.h b/experimental/hip/dynamic_symbol_tables.h index e15a38fb6813..a83e29e9aed2 100644 --- a/experimental/hip/dynamic_symbol_tables.h +++ b/experimental/hip/dynamic_symbol_tables.h @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// // HIP symbols //===----------------------------------------------------------------------===// -IREE_HIP_PFN_DECL(hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t) -IREE_HIP_PFN_DECL(hipCtxDestroy, hipCtx_t) -IREE_HIP_PFN_DECL(hipCtxGetDevice, hipDevice_t *) -IREE_HIP_PFN_DECL(hipCtxSetCurrent, hipCtx_t) + IREE_HIP_PFN_DECL(hipDeviceGet, hipDevice_t *, int) IREE_HIP_PFN_DECL(hipDeviceGetAttribute, int *, hipDeviceAttribute_t, int) IREE_HIP_PFN_DECL(hipDeviceGetName, char *, int, hipDevice_t) @@ -24,6 +21,7 @@ IREE_HIP_PFN_DECL(hipEventQuery, hipEvent_t) IREE_HIP_PFN_DECL(hipEventRecord, hipEvent_t, hipStream_t) IREE_HIP_PFN_DECL(hipEventSynchronize, hipEvent_t) IREE_HIP_PFN_DECL(hipFree, void *) +IREE_HIP_PFN_DECL(hipFreeAsync, void *, hipStream_t) IREE_HIP_PFN_DECL(hipFuncSetAttribute, const void *, hipFuncAttribute, int) IREE_HIP_PFN_DECL(hipGetDeviceCount, int *) IREE_HIP_PFN_DECL(hipGetDeviceProperties, hipDeviceProp_t *, int) @@ -33,13 +31,23 @@ IREE_HIP_PFN_STR_DECL(hipGetErrorName, hipError_t) IREE_HIP_PFN_STR_DECL(hipGetErrorString, hipError_t) IREE_HIP_PFN_DECL(hipHostFree, void *) IREE_HIP_PFN_DECL(hipHostGetDevicePointer, void **, void *, unsigned int) +IREE_HIP_PFN_DECL(hipHostMalloc, void **, size_t, unsigned int) +IREE_HIP_PFN_DECL(hipHostRegister, void *, size_t, unsigned int) +IREE_HIP_PFN_DECL(hipHostUnregister, void *) IREE_HIP_PFN_DECL(hipInit, unsigned int) IREE_HIP_PFN_DECL(hipMalloc, void **, size_t) +IREE_HIP_PFN_DECL(hipMallocFromPoolAsync, void **, size_t, hipMemPool_t, + hipStream_t) IREE_HIP_PFN_DECL(hipMallocManaged, hipDeviceptr_t *, size_t, unsigned int) -IREE_HIP_PFN_DECL(hipMemAllocHost, void **, size_t, unsigned int) IREE_HIP_PFN_DECL(hipMemcpy, void *, const void *, size_t, hipMemcpyKind) IREE_HIP_PFN_DECL(hipMemcpyAsync, void *, const void *, size_t, hipMemcpyKind, hipStream_t) +IREE_HIP_PFN_DECL(hipMemPoolCreate, hipMemPool_t *, const hipMemPoolProps *) +IREE_HIP_PFN_DECL(hipMemPoolDestroy, hipMemPool_t) +IREE_HIP_PFN_DECL(hipMemPoolGetAttribute, hipMemPool_t, hipMemPoolAttr, void *) +IREE_HIP_PFN_DECL(hipMemPoolSetAttribute, hipMemPool_t, hipMemPoolAttr, void *) +IREE_HIP_PFN_DECL(hipMemPoolTrimTo, hipMemPool_t, size_t) +IREE_HIP_PFN_DECL(hipMemPrefetchAsync, const void *, size_t, int, hipStream_t) IREE_HIP_PFN_DECL(hipMemset, void *, int, size_t) IREE_HIP_PFN_DECL(hipMemsetAsync, void *, int, size_t, hipStream_t) IREE_HIP_PFN_DECL(hipMemsetD8Async, void *, char, size_t, hipStream_t) diff --git a/experimental/hip/hip_allocator.c b/experimental/hip/hip_allocator.c new file mode 100644 index 000000000000..2e61f4d6a147 --- /dev/null +++ b/experimental/hip/hip_allocator.c @@ -0,0 +1,602 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "experimental/hip/hip_allocator.h" + +#include + +#include "experimental/hip/dynamic_symbols.h" +#include "experimental/hip/hip_buffer.h" +#include "experimental/hip/status_util.h" +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING +static const char* IREE_HAL_HIP_ALLOCATOR_ID = "HIP unpooled"; +#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING + +typedef struct iree_hal_hip_allocator_t { + // Abstract resource used for injecting reference counting and vtable; + // must be at offset 0. + iree_hal_resource_t resource; + + // The device that this allocator allocates memory from. + hipDevice_t device; + + // The HIP stream that allocations should be used in. + hipStream_t stream; + + // NOTE: optional depending on device support. + iree_hal_hip_memory_pools_t* pools; + + const iree_hal_hip_dynamic_symbols_t* symbols; + + iree_allocator_t host_allocator; + + // Whether the GPU and CPU can concurrently access HIP managed data in a + // coherent way. We would need to explicitly perform flushing and invalidation + // between GPU and CPU if not. + bool supports_concurrent_managed_access; + + IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;) +} iree_hal_hip_allocator_t; + +static const iree_hal_allocator_vtable_t iree_hal_hip_allocator_vtable; + +static iree_hal_hip_allocator_t* iree_hal_hip_allocator_cast( + iree_hal_allocator_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_hip_allocator_vtable); + return (iree_hal_hip_allocator_t*)base_value; +} + +iree_status_t iree_hal_hip_allocator_create( + const iree_hal_hip_dynamic_symbols_t* hip_symbols, hipDevice_t device, + hipStream_t stream, iree_hal_hip_memory_pools_t* pools, + iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) { + IREE_ASSERT_ARGUMENT(hip_symbols); + IREE_ASSERT_ARGUMENT(pools); + IREE_ASSERT_ARGUMENT(out_allocator); + IREE_TRACE_ZONE_BEGIN(z0); + + // To support device-local + host-visible memory we need concurrent managed + // access indicating that the host and devices can concurrently access the + // device memory. If we don't have this feature then we fall back to forcing + // all device-local + host-visible memory into host-local + device-visible + // page-locked memory. The compiler tries to avoid this for high-traffic + // buffers except for readback staging buffers. + int supports_concurrent_managed_access = 0; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, IREE_HIP_RESULT_TO_STATUS( + hip_symbols, + hipDeviceGetAttribute(&supports_concurrent_managed_access, + hipDeviceAttributeConcurrentManagedAccess, + device), + "hipDeviceGetAttribute")); + + IREE_TRACE_ZONE_APPEND_TEXT( + z0, supports_concurrent_managed_access + ? "has CONCURRENT_MANAGED_ACCESS" + : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on " + "device-local + host-visible memory)"); + + iree_hal_hip_allocator_t* allocator = NULL; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, sizeof(*allocator), + (void**)&allocator)); + iree_hal_resource_initialize(&iree_hal_hip_allocator_vtable, + &allocator->resource); + allocator->device = device; + allocator->stream = stream; + allocator->pools = pools; + allocator->symbols = hip_symbols; + allocator->host_allocator = host_allocator; + allocator->supports_concurrent_managed_access = + supports_concurrent_managed_access != 0; + *out_allocator = (iree_hal_allocator_t*)allocator; + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_hip_allocator_destroy( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + iree_hal_hip_allocator_t* allocator = + iree_hal_hip_allocator_cast(base_allocator); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(allocator->host_allocator, allocator); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_allocator_t iree_hal_hip_allocator_host_allocator( + const iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + iree_hal_hip_allocator_t* allocator = + (iree_hal_hip_allocator_t*)base_allocator; + return allocator->host_allocator; +} + +static iree_status_t iree_hal_hip_allocator_trim( + iree_hal_allocator_t* IREE_RESTRICT base_allocator) { + return iree_ok_status(); +} + +static void iree_hal_hip_allocator_query_statistics( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) { + IREE_STATISTICS({ + iree_hal_hip_allocator_t* allocator = + iree_hal_hip_allocator_cast(base_allocator); + memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics)); + if (allocator->pools) { + iree_hal_hip_memory_pools_merge_statistics(allocator->pools, + out_statistics); + } + }); +} + +static iree_status_t iree_hal_hip_allocator_query_memory_heaps( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_host_size_t capacity, + iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps, + iree_host_size_t* IREE_RESTRICT out_count) { + iree_hal_hip_allocator_t* allocator = + iree_hal_hip_allocator_cast(base_allocator); + + iree_host_size_t count = 3; + if (allocator->supports_concurrent_managed_access) { + ++count; // device-local | host-visible + } + if (out_count) *out_count = count; + if (capacity < count) { + // NOTE: lightweight as this is hit in normal pre-sizing usage. + return iree_status_from_code(IREE_STATUS_OUT_OF_RANGE); + } + + // Don't think there's a query for these. + // Max allocation size may be much smaller in certain memory types such as + // page-locked memory and it'd be good to enforce that. + const iree_device_size_t max_allocation_size = ~(iree_device_size_t)0; + const iree_device_size_t min_alignment = 64; + + int i = 0; + + // Device-local memory (dispatch resources): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, + .allowed_usage = + IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_DISPATCH, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + if (allocator->supports_concurrent_managed_access) { + // Device-local managed memory with host mapping support: + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + } + + // Write-combined page-locked host-local memory (upload): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + // Cached page-locked host-local memory (download): + heaps[i++] = (iree_hal_allocator_memory_heap_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_COHERENT | + IREE_HAL_MEMORY_TYPE_HOST_CACHED, + .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | + IREE_HAL_BUFFER_USAGE_DISPATCH | + IREE_HAL_BUFFER_USAGE_MAPPING, + .max_allocation_size = max_allocation_size, + .min_alignment = min_alignment, + }; + + IREE_ASSERT(i == count); + return iree_ok_status(); +} + +static iree_hal_buffer_compatibility_t +iree_hal_hip_allocator_query_buffer_compatibility( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t* IREE_RESTRICT allocation_size) { + iree_hal_hip_allocator_t* allocator = + iree_hal_hip_allocator_cast(base_allocator); + + // All buffers can be allocated on the heap. + iree_hal_buffer_compatibility_t compatibility = + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; + + // Buffers are importable in HIP under most cases, though performance may + // vary wildly. We don't fully verify that the buffer parameters are + // self-consistent and just look at whether we can get a device pointer. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE; + } + + // Buffers can only be used on the queue if they are device visible. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER; + } + if (iree_any_bit_set(params->usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH; + } + } + + // If concurrent managed access is not supported then make device-local + + // host-visible allocations fall back to host-local + device-visible + // page-locked memory. This will be significantly slower for the device to + // access but the compiler only uses this type for readback staging buffers + // and it's better to function than function fast. + if (!allocator->supports_concurrent_managed_access && + iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_LOW_PERFORMANCE; + params->type &= ~(IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE); + params->type |= + IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE; + } + + // We are now optimal. + params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; + + // Guard against the corner case where the requested buffer size is 0. The + // application is unlikely to do anything when requesting a 0-byte buffer; but + // it can happen in real world use cases. So we should at least not crash. + if (*allocation_size == 0) *allocation_size = 4; + + return compatibility; +} + +static void iree_hal_hip_buffer_free( + const iree_hal_hip_dynamic_symbols_t* hip_symbols, + iree_hal_hip_buffer_type_t buffer_type, hipDeviceptr_t device_ptr, + void* host_ptr) { + IREE_TRACE_ZONE_BEGIN(z0); + switch (buffer_type) { + case IREE_HAL_HIP_BUFFER_TYPE_DEVICE: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "hipFree"); + IREE_HIP_IGNORE_ERROR(hip_symbols, hipFree(device_ptr)); + break; + } + case IREE_HAL_HIP_BUFFER_TYPE_HOST: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "hipHostFree"); + IREE_HIP_IGNORE_ERROR(hip_symbols, hipHostFree(host_ptr)); + break; + } + case IREE_HAL_HIP_BUFFER_TYPE_HOST_REGISTERED: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "hipHostUnregister"); + IREE_HIP_IGNORE_ERROR(hip_symbols, hipHostUnregister(host_ptr)); + break; + } + case IREE_HAL_HIP_BUFFER_TYPE_ASYNC: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "(ignored; async)"); + break; + } + case IREE_HAL_HIP_BUFFER_TYPE_EXTERNAL: { + IREE_TRACE_ZONE_APPEND_TEXT(z0, "(ignored; external)"); + break; + } + } + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_hip_allocator_allocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + iree_hal_hip_allocator_t* allocator = + iree_hal_hip_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. + iree_hal_buffer_params_t compat_params = *params; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_hip_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_hip_buffer_type_t buffer_type = IREE_HAL_HIP_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + hipDeviceptr_t device_ptr = NULL; + IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_hip_buffer_allocate"); + IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, allocation_size); + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + // Device local case. + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + // Device local and host visible. + buffer_type = IREE_HAL_HIP_BUFFER_TYPE_DEVICE; + status = IREE_HIP_RESULT_TO_STATUS( + allocator->symbols, + hipMallocManaged(&device_ptr, allocation_size, hipMemAttachGlobal)); + if (iree_status_is_ok(status) && + allocator->supports_concurrent_managed_access) { + // Prefetch the buffer on the GPU device. + status = IREE_HIP_RESULT_TO_STATUS( + allocator->symbols, + hipMemPrefetchAsync(device_ptr, allocation_size, allocator->device, + allocator->stream)); + } + host_ptr = (void*)device_ptr; + } else { + // Device only. + buffer_type = IREE_HAL_HIP_BUFFER_TYPE_DEVICE; + status = IREE_HIP_RESULT_TO_STATUS( + allocator->symbols, hipMalloc(&device_ptr, allocation_size)); + } + } else { + // Host local case. + buffer_type = IREE_HAL_HIP_BUFFER_TYPE_HOST; + unsigned int flags = hipHostMallocMapped; + if (!iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { + flags |= hipHostMallocWriteCombined; + } + status = IREE_HIP_RESULT_TO_STATUS( + allocator->symbols, hipHostMalloc(&host_ptr, allocation_size, flags)); + if (iree_status_is_ok(status)) { + status = IREE_HIP_RESULT_TO_STATUS( + allocator->symbols, + hipHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0)); + } + } + IREE_TRACE_ZONE_END(z0); + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_hip_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, allocation_size, + /*byte_offset=*/0, + /*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr, + iree_hal_buffer_release_callback_null(), + iree_hal_allocator_host_allocator(base_allocator), &buffer); + } + + if (iree_status_is_ok(status)) { + IREE_TRACE_ALLOC_NAMED(IREE_HAL_HIP_ALLOCATOR_ID, + (void*)iree_hal_hip_buffer_device_pointer(buffer), + allocation_size); + IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( + &allocator->statistics, compat_params.type, allocation_size)); + *out_buffer = buffer; + } else { + if (!buffer && (device_ptr || host_ptr)) { + iree_hal_hip_buffer_free(allocator->symbols, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static void iree_hal_hip_allocator_deallocate_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT base_buffer) { + iree_hal_hip_allocator_t* allocator = + iree_hal_hip_allocator_cast(base_allocator); + + const iree_hal_hip_buffer_type_t buffer_type = + iree_hal_hip_buffer_type(base_buffer); + + iree_hal_hip_buffer_free(allocator->symbols, buffer_type, + iree_hal_hip_buffer_device_pointer(base_buffer), + iree_hal_hip_buffer_host_pointer(base_buffer)); + + switch (buffer_type) { + case IREE_HAL_HIP_BUFFER_TYPE_DEVICE: + case IREE_HAL_HIP_BUFFER_TYPE_HOST: { + IREE_TRACE_FREE_NAMED( + IREE_HAL_HIP_ALLOCATOR_ID, + (void*)iree_hal_hip_buffer_device_pointer(base_buffer)); + IREE_STATISTICS(iree_hal_allocator_statistics_record_free( + &allocator->statistics, iree_hal_buffer_memory_type(base_buffer), + iree_hal_buffer_allocation_size(base_buffer))); + break; + } + default: + // Buffer type not tracked. + break; + } + + iree_hal_buffer_destroy(base_buffer); +} + +static iree_status_t iree_hal_hip_allocator_import_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + const iree_hal_buffer_params_t* IREE_RESTRICT params, + iree_hal_external_buffer_t* IREE_RESTRICT external_buffer, + iree_hal_buffer_release_callback_t release_callback, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + iree_hal_hip_allocator_t* allocator = + iree_hal_hip_allocator_cast(base_allocator); + + // Coerce options into those required by the current device. + iree_hal_buffer_params_t compat_params = *params; + iree_device_size_t allocation_size = external_buffer->size; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_hip_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) { +#if IREE_STATUS_MODE + iree_bitfield_string_temp_t temp0, temp1, temp2; + iree_string_view_t memory_type_str = + iree_hal_memory_type_format(params->type, &temp0); + iree_string_view_t usage_str = + iree_hal_buffer_usage_format(params->usage, &temp1); + iree_string_view_t compatibility_str = + iree_hal_buffer_compatibility_format(compatibility, &temp2); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters; " + "memory_type=%.*s, usage=%.*s, compatibility=%.*s", + (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, + usage_str.data, (int)compatibility_str.size, compatibility_str.data); +#else + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot import a buffer with the given parameters"); +#endif // IREE_STATUS_MODE + } + + iree_status_t status = iree_ok_status(); + iree_hal_hip_buffer_type_t buffer_type = IREE_HAL_HIP_BUFFER_TYPE_DEVICE; + void* host_ptr = NULL; + hipDeviceptr_t device_ptr = NULL; + + switch (external_buffer->type) { + case IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION: { + if (iree_all_bits_set(compat_params.type, + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "unable to register host allocations as device-local memory"); + } + buffer_type = IREE_HAL_HIP_BUFFER_TYPE_HOST_REGISTERED; + host_ptr = external_buffer->handle.host_allocation.ptr; + uint32_t register_flags = hipHostRegisterMapped; + status = IREE_HIP_RESULT_TO_STATUS( + allocator->symbols, + hipHostRegister(host_ptr, external_buffer->size, register_flags), + "hipHostRegister"); + if (iree_status_is_ok(status)) { + status = IREE_HIP_RESULT_TO_STATUS( + allocator->symbols, + hipHostGetDevicePointer(&device_ptr, host_ptr, 0), + "hipHostGetDevicePointer"); + } + break; + } + case IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION: { + buffer_type = IREE_HAL_HIP_BUFFER_TYPE_EXTERNAL; + device_ptr = + (hipDeviceptr_t)external_buffer->handle.device_allocation.ptr; + break; + } + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD: + case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "handle-based imports not yet implemented"); + default: + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "external buffer type not supported"); + } + + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + status = iree_hal_hip_buffer_wrap( + base_allocator, compat_params.type, compat_params.access, + compat_params.usage, external_buffer->size, + /*byte_offset=*/0, + /*byte_length=*/external_buffer->size, buffer_type, device_ptr, + host_ptr, release_callback, + iree_hal_allocator_host_allocator(base_allocator), &buffer); + } + + if (iree_status_is_ok(status)) { + *out_buffer = buffer; + } else { + if (!buffer && (device_ptr || host_ptr)) { + iree_hal_hip_buffer_free(allocator->symbols, buffer_type, device_ptr, + host_ptr); + } else { + iree_hal_buffer_release(buffer); + } + } + return status; +} + +static iree_status_t iree_hal_hip_allocator_export_buffer( + iree_hal_allocator_t* IREE_RESTRICT base_allocator, + iree_hal_buffer_t* IREE_RESTRICT buffer, + iree_hal_external_buffer_type_t requested_type, + iree_hal_external_buffer_flags_t requested_flags, + iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) { + iree_hal_hip_buffer_type_t buffer_type = iree_hal_hip_buffer_type(buffer); + + switch (requested_type) { + case IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION: + switch (buffer_type) { + case IREE_HAL_HIP_BUFFER_TYPE_EXTERNAL: + out_external_buffer->flags = requested_flags; + out_external_buffer->type = requested_type; + out_external_buffer->handle.device_allocation.ptr = + ((uint64_t)(uintptr_t)iree_hal_hip_buffer_device_pointer(buffer)); + out_external_buffer->size = iree_hal_buffer_allocation_size(buffer); + return iree_ok_status(); + + default: + return iree_make_status(IREE_STATUS_UNAVAILABLE, + "HIP buffer type is not supported for " + "export as an external device allocation"); + } + + default: + return iree_make_status(IREE_STATUS_UNAVAILABLE, + "external buffer type not supported"); + } +} + +static const iree_hal_allocator_vtable_t iree_hal_hip_allocator_vtable = { + .destroy = iree_hal_hip_allocator_destroy, + .host_allocator = iree_hal_hip_allocator_host_allocator, + .trim = iree_hal_hip_allocator_trim, + .query_statistics = iree_hal_hip_allocator_query_statistics, + .query_memory_heaps = iree_hal_hip_allocator_query_memory_heaps, + .query_buffer_compatibility = + iree_hal_hip_allocator_query_buffer_compatibility, + .allocate_buffer = iree_hal_hip_allocator_allocate_buffer, + .deallocate_buffer = iree_hal_hip_allocator_deallocate_buffer, + .import_buffer = iree_hal_hip_allocator_import_buffer, + .export_buffer = iree_hal_hip_allocator_export_buffer, +}; diff --git a/experimental/hip/hip_allocator.h b/experimental/hip/hip_allocator.h new file mode 100644 index 000000000000..1e73855f0f3d --- /dev/null +++ b/experimental/hip/hip_allocator.h @@ -0,0 +1,32 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_EXPERIMENTAL_HIP_ALLOCATOR_H_ +#define IREE_EXPERIMENTAL_HIP_ALLOCATOR_H_ + +#include "experimental/hip/memory_pools.h" +#include "experimental/hip/status_util.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Creates a HIP memory allocator. +// |device| and |stream| will be used for management operations. +// |pools| provides memory pools that may be shared across multiple allocators +// and the pointer must remain valid for the lifetime of the allocator. +iree_status_t iree_hal_hip_allocator_create( + const iree_hal_hip_dynamic_symbols_t* hip_symbols, hipDevice_t device, + hipStream_t stream, iree_hal_hip_memory_pools_t* pools, + iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_EXPERIMENTAL_HIP_ALLOCATOR_H_ diff --git a/experimental/hip/hip_buffer.c b/experimental/hip/hip_buffer.c new file mode 100644 index 000000000000..60ea11a97520 --- /dev/null +++ b/experimental/hip/hip_buffer.c @@ -0,0 +1,171 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "experimental/hip/hip_buffer.h" + +#include +#include +#include + +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +typedef struct iree_hal_hip_buffer_t { + iree_hal_buffer_t base; + iree_hal_hip_buffer_type_t type; + void* host_ptr; + hipDeviceptr_t device_ptr; + iree_hal_buffer_release_callback_t release_callback; +} iree_hal_hip_buffer_t; + +static const iree_hal_buffer_vtable_t iree_hal_hip_buffer_vtable; + +static iree_hal_hip_buffer_t* iree_hal_hip_buffer_cast( + iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_hip_buffer_vtable); + return (iree_hal_hip_buffer_t*)base_value; +} + +static const iree_hal_hip_buffer_t* iree_hal_hip_buffer_const_cast( + const iree_hal_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_hip_buffer_vtable); + return (const iree_hal_hip_buffer_t*)base_value; +} + +iree_status_t iree_hal_hip_buffer_wrap( + iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_hip_buffer_type_t buffer_type, hipDeviceptr_t device_ptr, + void* host_ptr, iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { + IREE_ASSERT_ARGUMENT(out_buffer); + if (!host_ptr && iree_any_bit_set(allowed_usage, + IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT | + IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "mappable buffers require host pointers"); + } + + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_hip_buffer_t* buffer = NULL; + iree_status_t status = + iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); + if (iree_status_is_ok(status)) { + iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, + allocation_size, byte_offset, byte_length, + memory_type, allowed_access, allowed_usage, + &iree_hal_hip_buffer_vtable, &buffer->base); + buffer->type = buffer_type; + buffer->host_ptr = host_ptr; + buffer->device_ptr = device_ptr; + buffer->release_callback = release_callback; + *out_buffer = &buffer->base; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_hip_buffer_destroy(iree_hal_buffer_t* base_buffer) { + iree_hal_hip_buffer_t* buffer = iree_hal_hip_buffer_cast(base_buffer); + iree_allocator_t host_allocator = base_buffer->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + if (buffer->release_callback.fn) { + buffer->release_callback.fn(buffer->release_callback.user_data, + base_buffer); + } + iree_allocator_free(host_allocator, buffer); + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_hip_buffer_map_range( + iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + iree_hal_hip_buffer_t* buffer = iree_hal_hip_buffer_cast(base_buffer); + + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type(base_buffer), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage( + iree_hal_buffer_allowed_usage(base_buffer), + mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT + ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT + : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); + + uint8_t* data_ptr = (uint8_t*)(buffer->host_ptr) + local_byte_offset; + // If we mapped for discard scribble over the bytes. This is not a mandated + // behavior but it will make debugging issues easier. Alternatively for + // heap buffers we could reallocate them such that ASAN yells, but that + // would only work if the entire buffer was discarded. +#ifndef NDEBUG + if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { + memset(data_ptr, 0xCD, local_byte_length); + } +#endif // !NDEBUG + + mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + return iree_ok_status(); +} + +static iree_status_t iree_hal_hip_buffer_unmap_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { + // Nothing to do today. + return iree_ok_status(); +} + +static iree_status_t iree_hal_hip_buffer_invalidate_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do today. + return iree_ok_status(); +} + +static iree_status_t iree_hal_hip_buffer_flush_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // Nothing to do today. + return iree_ok_status(); +} + +iree_hal_hip_buffer_type_t iree_hal_hip_buffer_type( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_hip_buffer_t* buffer = + iree_hal_hip_buffer_const_cast(base_buffer); + return buffer->type; +} + +hipDeviceptr_t iree_hal_hip_buffer_device_pointer( + const iree_hal_buffer_t* base_buffer) { + const iree_hal_hip_buffer_t* buffer = + iree_hal_hip_buffer_const_cast(base_buffer); + return buffer->device_ptr; +} + +void* iree_hal_hip_buffer_host_pointer(const iree_hal_buffer_t* base_buffer) { + const iree_hal_hip_buffer_t* buffer = + iree_hal_hip_buffer_const_cast(base_buffer); + return buffer->host_ptr; +} + +void iree_hal_hip_buffer_drop_release_callback(iree_hal_buffer_t* base_buffer) { + iree_hal_hip_buffer_t* buffer = iree_hal_hip_buffer_cast(base_buffer); + buffer->release_callback = iree_hal_buffer_release_callback_null(); +} + +static const iree_hal_buffer_vtable_t iree_hal_hip_buffer_vtable = { + .recycle = iree_hal_buffer_recycle, + .destroy = iree_hal_hip_buffer_destroy, + .map_range = iree_hal_hip_buffer_map_range, + .unmap_range = iree_hal_hip_buffer_unmap_range, + .invalidate_range = iree_hal_hip_buffer_invalidate_range, + .flush_range = iree_hal_hip_buffer_flush_range, +}; diff --git a/experimental/hip/hip_buffer.h b/experimental/hip/hip_buffer.h new file mode 100644 index 000000000000..dff6836b14f1 --- /dev/null +++ b/experimental/hip/hip_buffer.h @@ -0,0 +1,68 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_EXPERIMENTAL_HIP_BUFFER_H_ +#define IREE_EXPERIMENTAL_HIP_BUFFER_H_ + +#include "experimental/hip/hip_headers.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +typedef enum iree_hal_hip_buffer_type_e { + // Device local buffer; allocated with hipMalloc/hipMallocManaged, freed + // with hipFree. + IREE_HAL_HIP_BUFFER_TYPE_DEVICE = 0, + // Host local buffer; allocated with hipHostMalloc, freed with hipHostFree. + IREE_HAL_HIP_BUFFER_TYPE_HOST, + // Host local buffer; registered with hipHostRegister, freed with + // hipHostUnregister. + IREE_HAL_HIP_BUFFER_TYPE_HOST_REGISTERED, + // Device local buffer, allocated with hipMallocFromPoolAsync, freed with + // hipFree/hipFreeAsync. + IREE_HAL_HIP_BUFFER_TYPE_ASYNC, + // Externally registered buffer whose providence is unknown. + // Must be freed by the user. + IREE_HAL_HIP_BUFFER_TYPE_EXTERNAL, +} iree_hal_hip_buffer_type_t; + +// Wraps a HIP allocation in an iree_hal_buffer_t. +iree_status_t iree_hal_hip_buffer_wrap( + iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_hip_buffer_type_t buffer_type, hipDeviceptr_t device_ptr, + void* host_ptr, iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); + +// Returns the underlying HIP buffer type. +iree_hal_hip_buffer_type_t iree_hal_hip_buffer_type( + const iree_hal_buffer_t* buffer); + +// Returns the HIP base pointer for the given |buffer|. +// This is the entire allocated_buffer and must be offset by the buffer +// byte_offset and byte_length when used. +hipDeviceptr_t iree_hal_hip_buffer_device_pointer( + const iree_hal_buffer_t* buffer); + +// Returns the HIP host pointer for the given |buffer|, if available. +void* iree_hal_hip_buffer_host_pointer(const iree_hal_buffer_t* buffer); + +// Drops the release callback so that when the buffer is destroyed no callback +// will be made. This is not thread safe but all callers are expected to be +// holding an allocation and the earliest the buffer could be destroyed is after +// this call returns and the caller has released its reference. +void iree_hal_hip_buffer_drop_release_callback(iree_hal_buffer_t* buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_EXPERIMENTAL_HIP_BUFFER_H_ diff --git a/experimental/hip/memory_pools.c b/experimental/hip/memory_pools.c new file mode 100644 index 000000000000..b6c35d097ef0 --- /dev/null +++ b/experimental/hip/memory_pools.c @@ -0,0 +1,304 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "experimental/hip/memory_pools.h" + +#include "experimental/hip/dynamic_symbols.h" +#include "experimental/hip/hip_buffer.h" +#include "experimental/hip/status_util.h" + +// NOTE: these are currently global for all devices; we could make +// device-specific ones by malloc() and leaking (with LSAN note) unique string +// values instead. +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING +static const char* IREE_HAL_HIP_DEVICE_LOCAL_POOL_RESERVED_ID = + "HIP pool: device-local reserved"; +static const char* IREE_HAL_HIP_OTHER_POOL_RESERVED_ID = + "HIP pool: other reserved"; +#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING + +static iree_status_t iree_hal_hip_create_memory_pool( + const iree_hal_hip_dynamic_symbols_t* hip_symbols, hipDevice_t hip_device, + iree_hal_hip_memory_pool_params_t params, + hipMemPool_t* IREE_RESTRICT out_pool) { + *out_pool = NULL; + + hipMemPoolProps pool_props = { + .allocType = hipMemAllocationTypePinned, + // TODO: allow sharing of certain pool memory types by fd/HANDLE. + .handleTypes = hipMemHandleTypeNone, + .location = + { + .type = hipMemLocationTypeDevice, + .id = hip_device, + }, + .reserved = {0}, + .win32SecurityAttributes = NULL, + }; + + hipMemPool_t pool = NULL; + + IREE_HIP_RETURN_IF_ERROR(hip_symbols, hipMemPoolCreate(&pool, &pool_props), + "hipMemPoolCreate"); + iree_status_t status = IREE_HIP_RESULT_TO_STATUS( + hip_symbols, + hipMemPoolSetAttribute(pool, hipMemPoolAttrReleaseThreshold, + ¶ms.release_threshold), + "hipMemPoolSetAttribute"); + + if (iree_status_is_ok(status)) { + *out_pool = pool; + } else { + IREE_HIP_IGNORE_ERROR(hip_symbols, hipMemPoolDestroy(pool)); + } + return status; +} + +iree_status_t iree_hal_hip_memory_pools_initialize( + const iree_hal_hip_dynamic_symbols_t* hip_symbols, hipDevice_t hip_device, + const iree_hal_hip_memory_pooling_params_t* pooling_params, + iree_allocator_t host_allocator, + iree_hal_hip_memory_pools_t* IREE_RESTRICT out_pools) { + IREE_ASSERT_ARGUMENT(hip_symbols); + IREE_ASSERT_ARGUMENT(pooling_params); + IREE_ASSERT_ARGUMENT(out_pools); + IREE_TRACE_ZONE_BEGIN(z0); + + memset(out_pools, 0, sizeof(*out_pools)); + out_pools->hip_symbols = hip_symbols; + out_pools->host_allocator = host_allocator; + + iree_status_t status = iree_ok_status(); + + if (iree_status_is_ok(status)) { + status = iree_hal_hip_create_memory_pool(hip_symbols, hip_device, + pooling_params->device_local, + &out_pools->device_local); + } + + if (iree_status_is_ok(status)) { + status = iree_hal_hip_create_memory_pool( + hip_symbols, hip_device, pooling_params->other, &out_pools->other); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +void iree_hal_hip_memory_pools_deinitialize( + iree_hal_hip_memory_pools_t* pools) { + IREE_TRACE_ZONE_BEGIN(z0); + + if (pools->device_local) { + IREE_HIP_IGNORE_ERROR(pools->hip_symbols, + hipMemPoolDestroy(pools->device_local)); + pools->device_local = NULL; + } + + if (pools->other) { + IREE_HIP_IGNORE_ERROR(pools->hip_symbols, hipMemPoolDestroy(pools->other)); + pools->other = NULL; + } + + IREE_TRACE_ZONE_END(z0); +} + +static void iree_hal_hip_memory_pool_track_alloc( + iree_hal_hip_memory_pools_t* pools, iree_hal_buffer_t* buffer) { + bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); + (void)is_device_local; + iree_device_size_t allocation_size = iree_hal_buffer_allocation_size(buffer); + (void)allocation_size; + IREE_TRACE_ALLOC_NAMED( + is_device_local ? IREE_HAL_HIP_DEVICE_LOCAL_POOL_RESERVED_ID + : IREE_HAL_HIP_OTHER_POOL_RESERVED_ID, + (void*)iree_hal_hip_buffer_device_pointer(buffer), allocation_size); + IREE_STATISTICS({ + iree_atomic_int64_t* bytes_allocated = + is_device_local ? &pools->statistics.device_bytes_allocated + : &pools->statistics.host_bytes_allocated; + iree_atomic_fetch_add_int64(bytes_allocated, allocation_size, + iree_memory_order_relaxed); + }); +} + +static void iree_hal_hip_memory_pool_track_free( + iree_hal_hip_memory_pools_t* pools, iree_hal_buffer_t* buffer) { + bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer), + IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL); + (void)is_device_local; + IREE_TRACE_FREE_NAMED(is_device_local + ? IREE_HAL_HIP_DEVICE_LOCAL_POOL_RESERVED_ID + : IREE_HAL_HIP_OTHER_POOL_RESERVED_ID, + (void*)iree_hal_hip_buffer_device_pointer(buffer)); + IREE_STATISTICS({ + iree_atomic_int64_t* bytes_freed = + is_device_local ? &pools->statistics.device_bytes_freed + : &pools->statistics.host_bytes_freed; + iree_device_size_t allocation_size = + iree_hal_buffer_allocation_size(buffer); + iree_atomic_fetch_add_int64(bytes_freed, allocation_size, + iree_memory_order_relaxed); + }); +} + +void iree_hal_hip_memory_pools_merge_statistics( + iree_hal_hip_memory_pools_t* pools, + iree_hal_allocator_statistics_t* statistics) { + IREE_STATISTICS({ + statistics->device_bytes_allocated = iree_atomic_load_int64( + &pools->statistics.device_bytes_allocated, iree_memory_order_relaxed); + statistics->host_bytes_allocated = iree_atomic_load_int64( + &pools->statistics.host_bytes_allocated, iree_memory_order_relaxed); + statistics->device_bytes_freed = iree_atomic_load_int64( + &pools->statistics.device_bytes_freed, iree_memory_order_relaxed); + statistics->host_bytes_freed = iree_atomic_load_int64( + &pools->statistics.host_bytes_freed, iree_memory_order_relaxed); + + if (pools->device_local) { + uint64_t pool_peak = 0; + IREE_HIP_IGNORE_ERROR( + pools->hip_symbols, + hipMemPoolGetAttribute(pools->device_local, hipMemPoolAttrUsedMemHigh, + &pool_peak)); + statistics->device_bytes_peak += (iree_device_size_t)pool_peak; + } + if (pools->other) { + uint64_t pool_peak = 0; + IREE_HIP_IGNORE_ERROR( + pools->hip_symbols, + hipMemPoolGetAttribute(pools->other, hipMemPoolAttrUsedMemHigh, + &pool_peak)); + statistics->host_bytes_peak += (iree_device_size_t)pool_peak; + } + }); +} + +iree_status_t iree_hal_hip_memory_pools_trim( + iree_hal_hip_memory_pools_t* pools, + const iree_hal_hip_memory_pooling_params_t* pooling_params) { + IREE_HIP_RETURN_IF_ERROR( + pools->hip_symbols, + hipMemPoolTrimTo(pools->device_local, + pooling_params->device_local.minimum_capacity), + "hipMemPoolTrimTo"); + IREE_HIP_RETURN_IF_ERROR( + pools->hip_symbols, + hipMemPoolTrimTo(pools->other, pooling_params->other.minimum_capacity), + "hipMemPoolTrimTo"); + return iree_ok_status(); +} + +// NOTE: this is only issued if the buffer is destroyed without having had been +// scheduled for deallocation asynchronously. When a buffer is scheduled we drop +// the release callback so that this isn't called and we don't double-free. +static void iree_hal_hip_async_buffer_release_callback( + void* user_data, iree_hal_buffer_t* buffer) { + iree_hal_hip_memory_pools_t* pools = (iree_hal_hip_memory_pools_t*)user_data; + IREE_TRACE_ZONE_BEGIN(z0); + + hipDeviceptr_t device_ptr = iree_hal_hip_buffer_device_pointer(buffer); + IREE_HIP_IGNORE_ERROR(pools->hip_symbols, hipFree(device_ptr)); + iree_hal_hip_memory_pool_track_free(pools, buffer); + + IREE_TRACE_ZONE_END(z0); +} + +iree_status_t iree_hal_hip_memory_pools_allocate( + iree_hal_hip_memory_pools_t* pools, hipStream_t stream, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)allocation_size); + + iree_hal_buffer_params_canonicalize(¶ms); + + // TODO: more pools and better selection; this is coarsely deciding between + // only device local (variables, constants, transients) and other (staging, + // external) but could use more buffer properties (including usage/export + // flags) to better isolate the different usage patterns and keep the pools + // operating with reasonable limits. We should be using the |pool| arg. + hipMemPool_t memory_pool = + iree_all_bits_set(params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL) + ? pools->device_local + : pools->other; + + hipDeviceptr_t device_ptr = NULL; + iree_status_t status = IREE_HIP_RESULT_TO_STATUS( + pools->hip_symbols, + hipMallocFromPoolAsync(&device_ptr, (size_t)allocation_size, memory_pool, + stream), + "hipMallocFromPoolAsync"); + + // Wrap the allocated HIP buffer in a HAL buffer. + // NOTE: we don't provide a device allocator because we didn't allocate from + // one and instead we use a release callback to perform the free if the user + // doesn't dealloca the buffer. + iree_hal_buffer_t* buffer = NULL; + if (iree_status_is_ok(status)) { + iree_hal_buffer_release_callback_t release_callback = { + .fn = iree_hal_hip_async_buffer_release_callback, + .user_data = pools, + }; + status = iree_hal_hip_buffer_wrap( + /*device_allocator=*/NULL, params.type, params.access, params.usage, + allocation_size, /*byte_offset=*/0, + /*byte_length=*/allocation_size, IREE_HAL_HIP_BUFFER_TYPE_ASYNC, + device_ptr, /*host_ptr=*/NULL, release_callback, pools->host_allocator, + &buffer); + } + + if (iree_status_is_ok(status)) { + // Update statistics (note that it may not yet be accurate). + iree_hal_hip_memory_pool_track_alloc(pools, buffer); + *out_buffer = buffer; + } else if (buffer) { + iree_hal_buffer_release(buffer); + } else { + IREE_HIP_IGNORE_ERROR(pools->hip_symbols, hipFreeAsync(device_ptr, stream)); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +iree_status_t iree_hal_hip_memory_pools_deallocate( + iree_hal_hip_memory_pools_t* pools, hipStream_t stream, + iree_hal_buffer_t* buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_TRACE_ZONE_APPEND_VALUE_I64( + z0, (int64_t)iree_hal_buffer_allocation_size(buffer)); + + // Only process the request if the buffer came from an async pool. + // We may get requests for deallocations on ones that didn't if one part of + // the application allocated the buffer synchronously and another deallocated + // it asynchronously. + iree_status_t status = iree_ok_status(); + if (iree_hal_hip_buffer_type(buffer) == IREE_HAL_HIP_BUFFER_TYPE_ASYNC) { + // Try to schedule the buffer for freeing. + hipDeviceptr_t device_ptr = iree_hal_hip_buffer_device_pointer(buffer); + status = IREE_HIP_RESULT_TO_STATUS( + pools->hip_symbols, hipFreeAsync(device_ptr, stream), "hipFreeAsync"); + if (iree_status_is_ok(status)) { + // Drop the release callback so that we don't try to double-free the + // buffer. Note that we only do this if the HIP free succeeded as + // otherwise we still need to synchronously deallocate the buffer when it + // is destroyed. + iree_hal_hip_buffer_drop_release_callback(buffer); + + // Update statistics (note that it may not yet be accurate). + iree_hal_hip_memory_pool_track_free(pools, buffer); + } + } else { + // Not allocated via alloca, ignore. + IREE_TRACE_ZONE_APPEND_TEXT(z0, "ignored sync allocation"); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} diff --git a/experimental/hip/memory_pools.h b/experimental/hip/memory_pools.h new file mode 100644 index 000000000000..5e3b605be3a5 --- /dev/null +++ b/experimental/hip/memory_pools.h @@ -0,0 +1,84 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_EXPERIMENTAL_HIP_MEMORY_POOLS_H_ +#define IREE_EXPERIMENTAL_HIP_MEMORY_POOLS_H_ + +#include "experimental/hip/api.h" +#include "experimental/hip/dynamic_symbols.h" +#include "experimental/hip/hip_headers.h" +#include "iree/base/api.h" +#include "iree/base/internal/atomics.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// WARNING: hipMemPool API's are marked as beta in HIP library meaning +// that while the feature is complete, it is still open to changes and may +// have outstanding issues. +// The following API's are used in this module: hipMallocFromPoolAsync, +// hipMemPoolCreate, hipMemPoolDestroy, hipMemPoolGetAttribute, +// hipMemPoolSetAttribute, hipMemPoolTrimTo + +// Retained HIP memory pools for various allocation types. +typedef struct iree_hal_hip_memory_pools_t { + // Used exclusively for DEVICE_LOCAL allocations. + hipMemPool_t device_local; + // Used for any host-visible/host-local memory types. + hipMemPool_t other; + + const iree_hal_hip_dynamic_symbols_t* hip_symbols; + iree_allocator_t host_allocator; + + IREE_STATISTICS(struct { + iree_atomic_int64_t device_bytes_allocated; + iree_atomic_int64_t device_bytes_freed; + iree_atomic_int64_t host_bytes_allocated; + iree_atomic_int64_t host_bytes_freed; + } statistics;) +} iree_hal_hip_memory_pools_t; + +// Initializes |out_pools| by configuring new HIP memory pools. +iree_status_t iree_hal_hip_memory_pools_initialize( + const iree_hal_hip_dynamic_symbols_t* hip_symbols, hipDevice_t hip_device, + const iree_hal_hip_memory_pooling_params_t* pooling_params, + iree_allocator_t host_allocator, + iree_hal_hip_memory_pools_t* IREE_RESTRICT out_pools); + +// Deinitializes the |pools| and releases the underlying HIP resources. +void iree_hal_hip_memory_pools_deinitialize(iree_hal_hip_memory_pools_t* pools); + +// Merges statistics information from |pools| into |statistics|. +void iree_hal_hip_memory_pools_merge_statistics( + iree_hal_hip_memory_pools_t* pools, + iree_hal_allocator_statistics_t* statistics); + +// Trims all memory pools by releasing resources back to the system. +iree_status_t iree_hal_hip_memory_pools_trim( + iree_hal_hip_memory_pools_t* pools, + const iree_hal_hip_memory_pooling_params_t* pooling_params); + +// Asynchronously allocates a buffer from an appropriate pool. +// The allocation will be stream-ordered on |stream|. +iree_status_t iree_hal_hip_memory_pools_allocate( + iree_hal_hip_memory_pools_t* pools, hipStream_t stream, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer); + +// Asynchronously deallocates a buffer from its pool. +// The deallocation will be stream-ordered on |stream|. +iree_status_t iree_hal_hip_memory_pools_deallocate( + iree_hal_hip_memory_pools_t* pools, hipStream_t stream, + iree_hal_buffer_t* buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_EXPERIMENTAL_HIP_MEMORY_POOLS_H_