From b313a9b2d2c1047bee3467a312192bcb71ce8cb9 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Tue, 22 Aug 2023 16:53:29 -0700 Subject: [PATCH] [cuda] Optimize device signal to host wait synchronization When we have a device `CUevent` signaling past a desired timepoint waited by the host, we can actually wait on the `CUevent` using `cuEventSynchronize`. This is profiled to be quite a bit faster than waiting on the full flow of device signal -> cuLaunchHostFunc (async) -> host signal -> host polling. --- experimental/cuda2/README.md | 4 +- experimental/cuda2/event_semaphore.c | 76 +++++++++++++++++++++------- experimental/cuda2/event_semaphore.h | 2 +- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/experimental/cuda2/README.md b/experimental/cuda2/README.md index 7825c755e736..1f2e8fbb0c89 100644 --- a/experimental/cuda2/README.md +++ b/experimental/cuda2/README.md @@ -107,7 +107,9 @@ instead: * We keep track of all GPU signals in the timeline. Once we see a GPU wait request, try to scan the timeline to find a GPU signal that advances the - timeline past the desired value, and use that for waiting instead. + timeline past the desired value, and use that for waiting instead. (This + actually applies to CPU waits too, and it's an optimization over pure + CPU side `iree_event_t` polling.) * We may not see GPU signal before seeing GPU wait requests, then we can also keep track of all GPU waits in the timeline. 
Later once see either a CPU signal or GPU signal advancing past the waited value, we can handle them diff --git a/experimental/cuda2/event_semaphore.c b/experimental/cuda2/event_semaphore.c index 0bddc8a83747..212bb3e13824 100644 --- a/experimental/cuda2/event_semaphore.c +++ b/experimental/cuda2/event_semaphore.c @@ -225,6 +225,37 @@ static iree_status_t iree_hal_cuda2_semaphore_acquire_timepoint_host_wait( return iree_ok_status(); } +// Acquires an iree_hal_cuda2_event_t object to wait on the host for the +// timeline to reach at least the given |min_value| on the device. +// Returns IREE_STATUS_OK and writes to |out_event| if we can find such an +// event; returns IREE_STATUS_NOT_FOUND otherwise. The caller should release +// the |out_event| once done. +static iree_status_t iree_hal_cuda2_semaphore_acquire_event_host_wait( + iree_hal_cuda2_semaphore_t* semaphore, uint64_t min_value, + iree_hal_cuda2_event_t** out_event) { + *out_event = NULL; + IREE_TRACE_ZONE_BEGIN(z0); + + // Scan through the timepoint list and try to find a device event signal to + // wait on. We need to lock with the timepoint list mutex here. + iree_slim_mutex_lock(&semaphore->base.timepoint_mutex); + for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head; + tp != NULL; tp = tp->next) { + iree_hal_cuda2_timepoint_t* signal_timepoint = + (iree_hal_cuda2_timepoint_t*)tp; + if (signal_timepoint->kind == IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL && + signal_timepoint->base.minimum_value >= min_value) { + *out_event = signal_timepoint->timepoint.device_signal; + iree_hal_cuda2_event_retain(*out_event); + break; + } + } + iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex); + + IREE_TRACE_ZONE_END(z0); + return iree_make_status(*out_event ? 
IREE_STATUS_OK : IREE_STATUS_NOT_FOUND); +} + static iree_status_t iree_hal_cuda2_semaphore_wait( iree_hal_semaphore_t* base_semaphore, uint64_t value, iree_timeout_t timeout) { @@ -255,10 +286,26 @@ static iree_status_t iree_hal_cuda2_semaphore_wait( iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout); + // Slow path: try to see if we can have a device CUevent to wait on. This + // should happen outside of the lock given that acquiring has its own internal + // locks. This is faster than waiting on a host timepoint. + iree_hal_cuda2_event_t* wait_event = NULL; + iree_status_t status = iree_hal_cuda2_semaphore_acquire_event_host_wait( + semaphore, value, &wait_event); + if (iree_status_is_ok(status)) { + IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR( + z0, semaphore->symbols, + cuEventSynchronize(iree_hal_cuda2_event_handle(wait_event)), + "cuEventSynchronize"); + iree_hal_cuda2_event_release(wait_event); + IREE_TRACE_ZONE_END(z0); + return status; + } + // Slow path: acquire a timepoint. This should happen outside of the lock to // given that acquiring has its own internal locks. iree_hal_cuda2_timepoint_t* timepoint = NULL; - iree_status_t status = iree_hal_cuda2_semaphore_acquire_timepoint_host_wait( + status = iree_hal_cuda2_semaphore_acquire_timepoint_host_wait( semaphore, value, timeout, &timepoint); if (IREE_UNLIKELY(!iree_status_is_ok(status))) { IREE_TRACE_ZONE_END(z0); @@ -380,25 +427,16 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait( }, &wait_timepoint->base); - // Scan through the timepoint list and try to find a device event signal to - // wait on. We need to lock with the timepoint list mutex here. 
- iree_slim_mutex_lock(&semaphore->base.timepoint_mutex); - for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head; - tp != NULL; tp = tp->next) { - iree_hal_cuda2_timepoint_t* signal_timepoint = - (iree_hal_cuda2_timepoint_t*)tp; - if (signal_timepoint->kind == IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL && - signal_timepoint->base.minimum_value >= min_value) { - // We've found an existing signal timepoint to wait on; we don't need a - // standalone wait timepoint anymore. Decrease its refcount before - // overwriting it to return it back to the pool and retain the new one. - iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait); - iree_hal_cuda2_event_t* event = signal_timepoint->timepoint.device_signal; - iree_hal_cuda2_event_retain(event); - wait_timepoint->timepoint.device_wait = event; - } + iree_hal_cuda2_event_t* wait_event = NULL; + iree_status_t status = iree_hal_cuda2_semaphore_acquire_event_host_wait( + semaphore, min_value, &wait_event); + if (iree_status_is_ok(status)) { + // We've found an existing signal timepoint to wait on; we don't need a + // standalone wait timepoint anymore. Decrease its refcount before + // overwriting it to return it back to the pool and retain the existing one. 
+ iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait); + wait_timepoint->timepoint.device_wait = wait_event; } - iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex); *out_event = iree_hal_cuda2_event_handle(wait_timepoint->timepoint.device_wait); diff --git a/experimental/cuda2/event_semaphore.h b/experimental/cuda2/event_semaphore.h index 1ec09ed60407..ec49704bd96c 100644 --- a/experimental/cuda2/event_semaphore.h +++ b/experimental/cuda2/event_semaphore.h @@ -43,7 +43,7 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_signal( CUevent* out_event); // Acquires a timepoint to wait the timeline to reach at least the given -// |min_value| on the device The underlying CUDA event is written into +// |min_value| on the device. The underlying CUDA event is written into // |out_event| for interacting with CUDA APIs. iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait( iree_hal_semaphore_t* base_semaphore, uint64_t min_value,