From b313a9b2d2c1047bee3467a312192bcb71ce8cb9 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@google.com>
Date: Tue, 22 Aug 2023 16:53:29 -0700
Subject: [PATCH] [cuda] Optimize device signal to host wait synchronization

When we have a device `CUevent` signaling past a desired timepoint
waited by the host, we can actually wait on the `CUevent` using
`cuEventSynchronize`. This is profiled to be considerably faster than
waiting on the full flow of device signal -> cuLaunchHostFunc
(async) -> host signal -> host polling.
---
 experimental/cuda2/README.md         |  4 +-
 experimental/cuda2/event_semaphore.c | 76 +++++++++++++++++++++-------
 experimental/cuda2/event_semaphore.h |  2 +-
 3 files changed, 61 insertions(+), 21 deletions(-)

diff --git a/experimental/cuda2/README.md b/experimental/cuda2/README.md
index 7825c755e736..1f2e8fbb0c89 100644
--- a/experimental/cuda2/README.md
+++ b/experimental/cuda2/README.md
@@ -107,7 +107,9 @@ instead:
 
 * We keep track of all GPU signals in the timeline. Once we see a GPU wait
   request, try to scan the timeline to find a GPU signal that advances the
-  timeline past the desired value, and use that for waiting instead.
+  timeline past the desired value, and use that for waiting instead. (This
+  actually applies to CPU waits too, and it's an optimization over pure
+  CPU side `iree_event_t` polling.)
 * We may not see GPU signal before seeing GPU wait requests, then we can also
   keep track of all GPU waits in the timeline. Later once see either a CPU
   signal or GPU signal advancing past the waited value, we can handle them
diff --git a/experimental/cuda2/event_semaphore.c b/experimental/cuda2/event_semaphore.c
index 0bddc8a83747..212bb3e13824 100644
--- a/experimental/cuda2/event_semaphore.c
+++ b/experimental/cuda2/event_semaphore.c
@@ -225,6 +225,37 @@ static iree_status_t iree_hal_cuda2_semaphore_acquire_timepoint_host_wait(
   return iree_ok_status();
 }
 
+// Looks for a device-signal event that the host can wait on so that the
+// timeline reaches at least |min_value|. The timepoint list is scanned from
+// its head; the first matching event is retained and stored in |out_event|,
+// and the caller must release it once done. Returns OK on a match and the
+// bare IREE_STATUS_NOT_FOUND code (with |out_event| set to NULL) otherwise.
+static iree_status_t iree_hal_cuda2_semaphore_acquire_event_host_wait(
+    iree_hal_cuda2_semaphore_t* semaphore, uint64_t min_value,
+    iree_hal_cuda2_event_t** out_event) {
+  *out_event = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // Scan through the timepoint list and try to find a device event signal to
+  // wait on. We need to lock with the timepoint list mutex here.
+  iree_slim_mutex_lock(&semaphore->base.timepoint_mutex);
+  for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head;
+       tp != NULL; tp = tp->next) {
+    iree_hal_cuda2_timepoint_t* candidate = (iree_hal_cuda2_timepoint_t*)tp;
+    if (candidate->kind == IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL &&
+        candidate->base.minimum_value >= min_value) {
+      *out_event = candidate->timepoint.device_signal;
+      iree_hal_cuda2_event_retain(*out_event);
+      break;
+    }
+  }
+  iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex);
+
+  IREE_TRACE_ZONE_END(z0);
+  // A miss is expected: return the bare code so no status storage is leaked.
+  return *out_event ? iree_ok_status()
+                    : iree_status_from_code(IREE_STATUS_NOT_FOUND);
+}
+
 static iree_status_t iree_hal_cuda2_semaphore_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t value,
     iree_timeout_t timeout) {
@@ -255,10 +286,26 @@ static iree_status_t iree_hal_cuda2_semaphore_wait(
 
   iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
 
+  // Slow path: try to see if we can have a device CUevent to wait on. This
+  // should happen outside of the lock given that acquiring has its own internal
+  // locks. This is faster than waiting on a host timepoint.
+  iree_hal_cuda2_event_t* wait_event = NULL;
+  iree_status_t status = iree_hal_cuda2_semaphore_acquire_event_host_wait(
+      semaphore, value, &wait_event);
+  if (iree_status_is_ok(status)) {
+    IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, semaphore->symbols,
+        cuEventSynchronize(iree_hal_cuda2_event_handle(wait_event)),
+        "cuEventSynchronize");
+    iree_hal_cuda2_event_release(wait_event);
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
   // Slow path: acquire a timepoint. This should happen outside of the lock to
   // given that acquiring has its own internal locks.
   iree_hal_cuda2_timepoint_t* timepoint = NULL;
-  iree_status_t status = iree_hal_cuda2_semaphore_acquire_timepoint_host_wait(
+  status = iree_hal_cuda2_semaphore_acquire_timepoint_host_wait(
       semaphore, value, timeout, &timepoint);
   if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
     IREE_TRACE_ZONE_END(z0);
@@ -380,25 +427,16 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait(
       },
       &wait_timepoint->base);
 
-  // Scan through the timepoint list and try to find a device event signal to
-  // wait on. We need to lock with the timepoint list mutex here.
-  iree_slim_mutex_lock(&semaphore->base.timepoint_mutex);
-  for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head;
-       tp != NULL; tp = tp->next) {
-    iree_hal_cuda2_timepoint_t* signal_timepoint =
-        (iree_hal_cuda2_timepoint_t*)tp;
-    if (signal_timepoint->kind == IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL &&
-        signal_timepoint->base.minimum_value >= min_value) {
-      // We've found an existing signal timepoint to wait on; we don't need a
-      // standalone wait timepoint anymore. Decrease its refcount before
-      // overwriting it to return it back to the pool and retain the new one.
-      iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait);
-      iree_hal_cuda2_event_t* event = signal_timepoint->timepoint.device_signal;
-      iree_hal_cuda2_event_retain(event);
-      wait_timepoint->timepoint.device_wait = event;
-    }
+  iree_hal_cuda2_event_t* wait_event = NULL;
+  iree_status_t status = iree_hal_cuda2_semaphore_acquire_event_host_wait(
+      semaphore, min_value, &wait_event);
+  if (iree_status_is_ok(status)) {
+    // We've found an existing signal timepoint to wait on; we don't need a
+    // standalone wait timepoint anymore. Release its event back to the pool
+    // and take over the found event, which was already retained for us.
+    iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait);
+    wait_timepoint->timepoint.device_wait = wait_event;
   }
-  iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex);
 
   *out_event =
       iree_hal_cuda2_event_handle(wait_timepoint->timepoint.device_wait);
diff --git a/experimental/cuda2/event_semaphore.h b/experimental/cuda2/event_semaphore.h
index 1ec09ed60407..ec49704bd96c 100644
--- a/experimental/cuda2/event_semaphore.h
+++ b/experimental/cuda2/event_semaphore.h
@@ -43,7 +43,7 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_signal(
     CUevent* out_event);
 
 // Acquires a timepoint to wait the timeline to reach at least the given
-// |min_value| on the device The underlying CUDA event is written into
+// |min_value| on the device. The underlying CUDA event is written into
 // |out_event| for interacting with CUDA APIs.
 iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t min_value,