Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cuda] Optimize device signal to host wait synchronization #14876

Merged
merged 2 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion experimental/cuda2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ instead:

* We keep track of all GPU signals in the timeline. Once we see a GPU wait
request, try to scan the timeline to find a GPU signal that advances the
timeline past the desired value, and use that for waiting instead.
timeline past the desired value, and use that for waiting instead. (This
actually applies to CPU waits too, and it's an optimization over pure
CPU side `iree_event_t` polling.)
* We may not see GPU signal before seeing GPU wait requests, then we can also
keep track of all GPU waits in the timeline. Later once see either a CPU
signal or GPU signal advancing past the waited value, we can handle them
Expand Down
72 changes: 54 additions & 18 deletions experimental/cuda2/event_semaphore.c
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,37 @@ static iree_status_t iree_hal_cuda2_semaphore_acquire_timepoint_host_wait(
return iree_ok_status();
}

// Acquires an iree_hal_cuda2_event_t object to wait on the host for the
// timeline to reach at least the given |min_value| on the device.
// Returns true and writes to |out_event| if we can find such an event;
// returns false otherwise.
// The caller should release the |out_event| once done.
static bool iree_hal_cuda2_semaphore_acquire_event_host_wait(
iree_hal_cuda2_semaphore_t* semaphore, uint64_t min_value,
iree_hal_cuda2_event_t** out_event) {
*out_event = NULL;
IREE_TRACE_ZONE_BEGIN(z0);

// Scan through the timepoint list and try to find a device event signal to
// wait on. We need to lock with the timepoint list mutex here.
iree_slim_mutex_lock(&semaphore->base.timepoint_mutex);
for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head;
tp != NULL; tp = tp->next) {
iree_hal_cuda2_timepoint_t* signal_timepoint =
(iree_hal_cuda2_timepoint_t*)tp;
if (signal_timepoint->kind == IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL &&
signal_timepoint->base.minimum_value >= min_value) {
*out_event = signal_timepoint->timepoint.device_signal;
iree_hal_cuda2_event_retain(*out_event);
break;
}
}
iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex);

IREE_TRACE_ZONE_END(z0);
return *out_event != NULL;
}

static iree_status_t iree_hal_cuda2_semaphore_wait(
iree_hal_semaphore_t* base_semaphore, uint64_t value,
iree_timeout_t timeout) {
Expand Down Expand Up @@ -255,6 +286,21 @@ static iree_status_t iree_hal_cuda2_semaphore_wait(

iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);

// Slow path: try to see if we can have a device CUevent to wait on. This
// should happen outside of the lock given that acquiring has its own internal
// locks. This is faster than waiting on a host timepoint.
iree_hal_cuda2_event_t* wait_event = NULL;
if (iree_hal_cuda2_semaphore_acquire_event_host_wait(semaphore, value,
&wait_event)) {
IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR(
z0, semaphore->symbols,
cuEventSynchronize(iree_hal_cuda2_event_handle(wait_event)),
"cuEventSynchronize");
iree_hal_cuda2_event_release(wait_event);
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}

// Slow path: acquire a timepoint. This should happen outside of the lock to
// given that acquiring has its own internal locks.
iree_hal_cuda2_timepoint_t* timepoint = NULL;
Expand Down Expand Up @@ -380,25 +426,15 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait(
},
&wait_timepoint->base);

// Scan through the timepoint list and try to find a device event signal to
// wait on. We need to lock with the timepoint list mutex here.
iree_slim_mutex_lock(&semaphore->base.timepoint_mutex);
for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head;
tp != NULL; tp = tp->next) {
iree_hal_cuda2_timepoint_t* signal_timepoint =
(iree_hal_cuda2_timepoint_t*)tp;
if (signal_timepoint->kind == IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL &&
signal_timepoint->base.minimum_value >= min_value) {
// We've found an existing signal timepoint to wait on; we don't need a
// standalone wait timepoint anymore. Decrease its refcount before
// overwriting it to return it back to the pool and retain the new one.
iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait);
iree_hal_cuda2_event_t* event = signal_timepoint->timepoint.device_signal;
iree_hal_cuda2_event_retain(event);
wait_timepoint->timepoint.device_wait = event;
}
iree_hal_cuda2_event_t* wait_event = NULL;
if (iree_hal_cuda2_semaphore_acquire_event_host_wait(semaphore, min_value,
&wait_event)) {
// We've found an existing signal timepoint to wait on; we don't need a
// standalone wait timepoint anymore. Decrease its refcount before
// overwriting it to return it back to the pool and retain the existing one.
iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait);
wait_timepoint->timepoint.device_wait = wait_event;
}
iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex);

*out_event =
iree_hal_cuda2_event_handle(wait_timepoint->timepoint.device_wait);
Expand Down
2 changes: 1 addition & 1 deletion experimental/cuda2/event_semaphore.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_signal(
CUevent* out_event);

// Acquires a timepoint to wait the timeline to reach at least the given
// |min_value| on the device The underlying CUDA event is written into
// |min_value| on the device. The underlying CUDA event is written into
// |out_event| for interacting with CUDA APIs.
iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait(
iree_hal_semaphore_t* base_semaphore, uint64_t min_value,
Expand Down
Loading