Skip to content

Commit

Permalink
[cuda] Optimize device signal to host wait synchronization
Browse files Browse the repository at this point in the history
When we have a device `CUevent` signaling past a desired timepoint
waited by the host, we can actually wait on the `CUevent` using
`cuEventSynchronize`. This is profiled to be considerably faster than
waiting on the full flow of device signal -> cuLaunchHostFunc
(async) -> host signal -> host polling.
  • Loading branch information
antiagainst committed Aug 30, 2023
1 parent 6e04816 commit b313a9b
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 21 deletions.
4 changes: 3 additions & 1 deletion experimental/cuda2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ instead:

* We keep track of all GPU signals in the timeline. Once we see a GPU wait
request, try to scan the timeline to find a GPU signal that advances the
timeline past the desired value, and use that for waiting instead.
timeline past the desired value, and use that for waiting instead. (This
actually applies to CPU waits too, and it's an optimization over pure
CPU side `iree_event_t` polling.)
* We may not see a GPU signal before seeing GPU wait requests; in that case we
can also keep track of all GPU waits in the timeline. Later, once we see either a CPU
signal or GPU signal advancing past the waited value, we can handle them
Expand Down
76 changes: 57 additions & 19 deletions experimental/cuda2/event_semaphore.c
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,37 @@ static iree_status_t iree_hal_cuda2_semaphore_acquire_timepoint_host_wait(
return iree_ok_status();
}

// Acquires an iree_hal_cuda2_event_t object to wait on the host for the
// timeline to reach at least the given |min_value| on the device.
//
// Scans the semaphore's timepoint list, while holding the timepoint list
// mutex, for the first device signal timepoint whose minimum value is at
// least |min_value|.
//
// Returns IREE_STATUS_OK and writes a retained event to |out_event| if such a
// timepoint is found; returns IREE_STATUS_NOT_FOUND and leaves |out_event| as
// NULL otherwise. The caller should release |out_event| once done.
//
// Not finding an event is an expected outcome that callers only test with
// iree_status_is_ok() and then discard, so the result is produced as a
// code-only status rather than via iree_make_status(); this avoids handing
// callers a status object they would have to free or explicitly ignore.
static iree_status_t iree_hal_cuda2_semaphore_acquire_event_host_wait(
    iree_hal_cuda2_semaphore_t* semaphore, uint64_t min_value,
    iree_hal_cuda2_event_t** out_event) {
  *out_event = NULL;
  IREE_TRACE_ZONE_BEGIN(z0);

  iree_hal_cuda2_event_t* found_event = NULL;

  // The timepoint list may only be walked with the timepoint mutex held.
  iree_slim_mutex_lock(&semaphore->base.timepoint_mutex);
  for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head;
       tp != NULL; tp = tp->next) {
    iree_hal_cuda2_timepoint_t* signal_timepoint =
        (iree_hal_cuda2_timepoint_t*)tp;
    if (signal_timepoint->kind != IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL) {
      continue;
    }
    if (signal_timepoint->base.minimum_value < min_value) continue;

    // Retain while still holding the lock so the event is kept alive for the
    // caller after the mutex is released.
    found_event = signal_timepoint->timepoint.device_signal;
    iree_hal_cuda2_event_retain(found_event);
    break;
  }
  iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex);

  *out_event = found_event;

  IREE_TRACE_ZONE_END(z0);
  return iree_status_from_code(found_event ? IREE_STATUS_OK
                                           : IREE_STATUS_NOT_FOUND);
}

static iree_status_t iree_hal_cuda2_semaphore_wait(
iree_hal_semaphore_t* base_semaphore, uint64_t value,
iree_timeout_t timeout) {
Expand Down Expand Up @@ -255,10 +286,26 @@ static iree_status_t iree_hal_cuda2_semaphore_wait(

iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);

// Slow path: try to see if we can have a device CUevent to wait on. This
// should happen outside of the lock given that acquiring has its own internal
// locks. This is faster than waiting on a host timepoint.
iree_hal_cuda2_event_t* wait_event = NULL;
iree_status_t status = iree_hal_cuda2_semaphore_acquire_event_host_wait(
semaphore, value, &wait_event);
if (iree_status_is_ok(status)) {
IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR(
z0, semaphore->symbols,
cuEventSynchronize(iree_hal_cuda2_event_handle(wait_event)),
"cuEventSynchronize");
iree_hal_cuda2_event_release(wait_event);
IREE_TRACE_ZONE_END(z0);
return status;
}

// Slow path: acquire a timepoint. This should happen outside of the lock
// given that acquiring has its own internal locks.
iree_hal_cuda2_timepoint_t* timepoint = NULL;
iree_status_t status = iree_hal_cuda2_semaphore_acquire_timepoint_host_wait(
status = iree_hal_cuda2_semaphore_acquire_timepoint_host_wait(
semaphore, value, timeout, &timepoint);
if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
IREE_TRACE_ZONE_END(z0);
Expand Down Expand Up @@ -380,25 +427,16 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait(
},
&wait_timepoint->base);

// Scan through the timepoint list and try to find a device event signal to
// wait on. We need to lock with the timepoint list mutex here.
iree_slim_mutex_lock(&semaphore->base.timepoint_mutex);
for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head;
tp != NULL; tp = tp->next) {
iree_hal_cuda2_timepoint_t* signal_timepoint =
(iree_hal_cuda2_timepoint_t*)tp;
if (signal_timepoint->kind == IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL &&
signal_timepoint->base.minimum_value >= min_value) {
// We've found an existing signal timepoint to wait on; we don't need a
// standalone wait timepoint anymore. Decrease its refcount before
// overwriting it to return it back to the pool and retain the new one.
iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait);
iree_hal_cuda2_event_t* event = signal_timepoint->timepoint.device_signal;
iree_hal_cuda2_event_retain(event);
wait_timepoint->timepoint.device_wait = event;
}
iree_hal_cuda2_event_t* wait_event = NULL;
iree_status_t status = iree_hal_cuda2_semaphore_acquire_event_host_wait(
semaphore, min_value, &wait_event);
if (iree_status_is_ok(status)) {
// We've found an existing signal timepoint to wait on; we don't need a
// standalone wait timepoint anymore. Decrease its refcount before
// overwriting it to return it back to the pool and retain the existing one.
iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait);
wait_timepoint->timepoint.device_wait = wait_event;
}
iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex);

*out_event =
iree_hal_cuda2_event_handle(wait_timepoint->timepoint.device_wait);
Expand Down
2 changes: 1 addition & 1 deletion experimental/cuda2/event_semaphore.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_signal(
CUevent* out_event);

// Acquires a timepoint to wait the timeline to reach at least the given
// |min_value| on the device The underlying CUDA event is written into
// |min_value| on the device. The underlying CUDA event is written into
// |out_event| for interacting with CUDA APIs.
iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait(
iree_hal_semaphore_t* base_semaphore, uint64_t min_value,
Expand Down

0 comments on commit b313a9b

Please sign in to comment.