iree-org · antiagainst · Aug 31, 2023 · Aug 22, 2023 · Aug 31, 2023
@@ -107,7 +107,9 @@ instead:
 
 * We keep track of all GPU signals in the timeline. Once we see a GPU wait
   request, try to scan the timeline to find a GPU signal that advances the
-  timeline past the desired value, and use that for waiting instead.
+  timeline past the desired value, and use that for waiting instead. (This
+  actually applies to CPU waits too, and it's an optimization over pure
+  CPU side `iree_event_t` polling.)
 * We may not see GPU signal before seeing GPU wait requests, then we can also
   keep track of all GPU waits in the timeline. Later once see either a CPU
   signal or GPU signal advancing past the waited value, we can handle them

@@ -225,6 +225,37 @@ static iree_status_t iree_hal_cuda2_semaphore_acquire_timepoint_host_wait(
   return iree_ok_status();
 }
 
+// Acquires an iree_hal_cuda2_event_t object to wait on the host for the
+// timeline to reach at least the given |min_value| on the device.
+// Returns true and writes to |out_event| if we can find such an event;
+// returns false otherwise.
+// The caller should release the |out_event| once done.
+static bool iree_hal_cuda2_semaphore_acquire_event_host_wait(
+    iree_hal_cuda2_semaphore_t* semaphore, uint64_t min_value,
+    iree_hal_cuda2_event_t** out_event) {
+  *out_event = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Scan through the timepoint list and try to find a device event signal to
+  // wait on. We need to lock with the timepoint list mutex here.
+  iree_slim_mutex_lock(&semaphore->base.timepoint_mutex);
+  for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head;
+       tp != NULL; tp = tp->next) {
+    iree_hal_cuda2_timepoint_t* signal_timepoint =
+        (iree_hal_cuda2_timepoint_t*)tp;
+    if (signal_timepoint->kind == IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL &&
+        signal_timepoint->base.minimum_value >= min_value) {
+      *out_event = signal_timepoint->timepoint.device_signal;
+      iree_hal_cuda2_event_retain(*out_event);
+      break;
+    }
+  }
+  iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex);
+
+  IREE_TRACE_ZONE_END(z0);
+  return *out_event != NULL;
+}
+
 static iree_status_t iree_hal_cuda2_semaphore_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t value,
     iree_timeout_t timeout) {
@@ -255,6 +286,21 @@ static iree_status_t iree_hal_cuda2_semaphore_wait(
 
   iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
 
+  // Slow path: try to see if we can have a device CUevent to wait on. This
+  // should happen outside of the lock given that acquiring has its own internal
+  // locks. This is faster than waiting on a host timepoint.
+  iree_hal_cuda2_event_t* wait_event = NULL;
+  if (iree_hal_cuda2_semaphore_acquire_event_host_wait(semaphore, value,
+                                                       &wait_event)) {
+    IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, semaphore->symbols,
+        cuEventSynchronize(iree_hal_cuda2_event_handle(wait_event)),
+        "cuEventSynchronize");
+    iree_hal_cuda2_event_release(wait_event);
+    IREE_TRACE_ZONE_END(z0);
+    return iree_ok_status();
+  }
+
   // Slow path: acquire a timepoint. This should happen outside of the lock to
   // given that acquiring has its own internal locks.
   iree_hal_cuda2_timepoint_t* timepoint = NULL;
@@ -380,25 +426,15 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait(
       },
       &wait_timepoint->base);
 
-  // Scan through the timepoint list and try to find a device event signal to
-  // wait on. We need to lock with the timepoint list mutex here.
-  iree_slim_mutex_lock(&semaphore->base.timepoint_mutex);
-  for (iree_hal_semaphore_timepoint_t* tp = semaphore->base.timepoint_list.head;
-       tp != NULL; tp = tp->next) {
-    iree_hal_cuda2_timepoint_t* signal_timepoint =
-        (iree_hal_cuda2_timepoint_t*)tp;
-    if (signal_timepoint->kind == IREE_HAL_CUDA_TIMEPOINT_KIND_DEVICE_SIGNAL &&
-        signal_timepoint->base.minimum_value >= min_value) {
-      // We've found an existing signal timepoint to wait on; we don't need a
-      // standalone wait timepoint anymore. Decrease its refcount before
-      // overwriting it to return it back to the pool and retain the new one.
-      iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait);
-      iree_hal_cuda2_event_t* event = signal_timepoint->timepoint.device_signal;
-      iree_hal_cuda2_event_retain(event);
-      wait_timepoint->timepoint.device_wait = event;
-    }
+  iree_hal_cuda2_event_t* wait_event = NULL;
+  if (iree_hal_cuda2_semaphore_acquire_event_host_wait(semaphore, min_value,
+                                                       &wait_event)) {
+    // We've found an existing signal timepoint to wait on; we don't need a
+    // standalone wait timepoint anymore. Decrease its refcount before
+    // overwriting it to return it back to the pool and retain the existing one.
+    iree_hal_cuda2_event_release(wait_timepoint->timepoint.device_wait);
+    wait_timepoint->timepoint.device_wait = wait_event;
   }
-  iree_slim_mutex_unlock(&semaphore->base.timepoint_mutex);
 
   *out_event =
       iree_hal_cuda2_event_handle(wait_timepoint->timepoint.device_wait);

@@ -43,7 +43,7 @@ iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_signal(
     CUevent* out_event);
 
 // Acquires a timepoint to wait the timeline to reach at least the given
-// |min_value| on the device The underlying CUDA event is written into
+// |min_value| on the device. The underlying CUDA event is written into
 // |out_event| for interacting with CUDA APIs.
 iree_status_t iree_hal_cuda2_event_semaphore_acquire_timepoint_device_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t min_value,