From c4e01e95f2e14612c76098e50c2d724af509f99f Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@google.com>
Date: Fri, 9 Jun 2023 17:38:05 -0400
Subject: [PATCH] [cuda] Wire up basic creating devices, allocators, and
 buffers (#14011)

This commit ports over existing CUDA HAL driver code to enable
creating devices, allocators, and buffers. It's mostly NFC,
with just symbol renaming and turning various functionalities
into unimplemented.

With this commit, we are able to pick up CTS tests.

Progress towards https://github.com/openxla/iree/issues/13245
---
 experimental/cuda2/CMakeLists.txt             |   5 +
 experimental/cuda2/api.h                      |  29 +-
 experimental/cuda2/cts/CMakeLists.txt         |  24 +
 experimental/cuda2/cuda_device.c              | 481 ++++++++++++++++++
 experimental/cuda2/cuda_device.h              |  48 ++
 experimental/cuda2/cuda_driver.c              |  23 +-
 experimental/cuda2/memory_pools.c             |   2 +-
 experimental/cuda2/memory_pools.h             |   2 +-
 .../cuda2/registration/driver_module.c        |  10 +-
 9 files changed, 615 insertions(+), 9 deletions(-)
 create mode 100644 experimental/cuda2/cts/CMakeLists.txt
 create mode 100644 experimental/cuda2/cuda_device.c
 create mode 100644 experimental/cuda2/cuda_device.h

diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt
index ca1959516917..d06868660ac4 100644
--- a/experimental/cuda2/CMakeLists.txt
+++ b/experimental/cuda2/CMakeLists.txt
@@ -21,15 +21,20 @@ iree_cc_library(
     "cuda_allocator.h"
     "cuda_buffer.c"
     "cuda_buffer.h"
+    "cuda_device.c"
+    "cuda_device.h"
     "cuda_driver.c"
     "memory_pools.c"
     "memory_pools.h"
   DEPS
     ::dynamic_symbols
     iree::base
+    iree::base::internal
+    iree::base::internal::arena
     iree::base::core_headers
     iree::base::tracing
     iree::hal
+    iree::hal::utils::buffer_transfer
     iree::schemas::cuda_executable_def_c_fbs
   PUBLIC
 )
diff --git a/experimental/cuda2/api.h b/experimental/cuda2/api.h
index a56b656b2b7c..777d6ef238d1 100644
--- a/experimental/cuda2/api.h
+++ b/experimental/cuda2/api.h
@@ -17,7 +17,7 @@ extern "C" {
 #endif  // __cplusplus
 
 //===----------------------------------------------------------------------===//
-// iree_hal_cuda_device_t
+// iree_hal_cuda2_device_t
 //===----------------------------------------------------------------------===//
 
 // Parameters defining a CUmemoryPool.
@@ -40,6 +40,32 @@ typedef struct iree_hal_cuda2_memory_pooling_params_t {
   iree_hal_cuda2_memory_pool_params_t other;
 } iree_hal_cuda2_memory_pooling_params_t;
 
+// Parameters configuring an iree_hal_cuda2_device_t.
+// Must be initialized with iree_hal_cuda2_device_params_initialize prior to
+// use.
+typedef struct iree_hal_cuda2_device_params_t {
+  // Number of queues exposed on the device.
+  // Each queue acts as a separate synchronization scope where all work executes
+  // concurrently unless prohibited by semaphores.
+  iree_host_size_t queue_count;
+
+  // Total size of each block in the device shared block pool.
+  // Larger sizes will lower overhead and ensure the heap isn't hit for
+  // transient allocations while also increasing memory consumption.
+  iree_host_size_t arena_block_size;
+
+  // Whether to use async allocations even if reported as available by the
+  // device. Defaults to true when the device supports it.
+  bool async_allocations;
+
+  // Parameters for each CUmemoryPool used for queue-ordered allocations.
+  iree_hal_cuda2_memory_pooling_params_t memory_pools;
+} iree_hal_cuda2_device_params_t;
+
+// Initializes |out_params| to default values.
+IREE_API_EXPORT void iree_hal_cuda2_device_params_initialize(
+    iree_hal_cuda2_device_params_t* out_params);
+
 //===----------------------------------------------------------------------===//
 // iree_hal_cuda2_driver_t
 //===----------------------------------------------------------------------===//
@@ -62,6 +88,7 @@ IREE_API_EXPORT void iree_hal_cuda2_driver_options_initialize(
 IREE_API_EXPORT iree_status_t iree_hal_cuda2_driver_create(
     iree_string_view_t identifier,
     const iree_hal_cuda2_driver_options_t* options,
+    const iree_hal_cuda2_device_params_t* default_params,
     iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);
 
 #ifdef __cplusplus
diff --git a/experimental/cuda2/cts/CMakeLists.txt b/experimental/cuda2/cts/CMakeLists.txt
new file mode 100644
index 000000000000..7c04876827ee
--- /dev/null
+++ b/experimental/cuda2/cts/CMakeLists.txt
@@ -0,0 +1,24 @@
+# Copyright 2023 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_hal_cts_test_suite(
+  DRIVER_NAME
+    cuda2
+  DRIVER_REGISTRATION_HDR
+    "experimental/cuda2/registration/driver_module.h"
+  DRIVER_REGISTRATION_FN
+    "iree_hal_cuda2_driver_module_register"
+  COMPILER_TARGET_BACKEND
+    "cuda"
+  EXECUTABLE_FORMAT
+    "\"PTXE\""
+  DEPS
+    iree::experimental::cuda2::registration
+  INCLUDED_TESTS
+    "allocator"
+    "buffer_mapping"
+    "driver"
+)
diff --git a/experimental/cuda2/cuda_device.c b/experimental/cuda2/cuda_device.c
new file mode 100644
index 000000000000..9ea9ddce0e3f
--- /dev/null
+++ b/experimental/cuda2/cuda_device.c
@@ -0,0 +1,481 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/cuda2/cuda_device.h"
+
+#include <iree/hal/device.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "experimental/cuda2/cuda_allocator.h"
+#include "experimental/cuda2/cuda_buffer.h"
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_status_util.h"
+#include "experimental/cuda2/memory_pools.h"
+#include "iree/base/internal/arena.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/utils/buffer_transfer.h"
+#include "iree/hal/utils/deferred_command_buffer.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_cuda2_device_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_cuda2_device_t {
+  // Abstract resource used for injecting reference counting and vtable;
+  // must be at offset 0.
+  iree_hal_resource_t resource;
+  iree_string_view_t identifier;
+
+  // Block pool used for command buffers with a larger block size (as command
+  // buffers can contain inlined data uploads).
+  iree_arena_block_pool_t block_pool;
+
+  // Optional driver that owns the CUDA symbols. We retain it for our lifetime
+  // to ensure the symbols remains valid.
+  iree_hal_driver_t* driver;
+
+  const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols;
+
+  // Parameters used to control device behavior.
+  iree_hal_cuda2_device_params_t params;
+
+  CUcontext cu_context;
+  CUdevice cu_device;
+  // TODO: support multiple streams.
+  CUstream cu_stream;
+
+  iree_allocator_t host_allocator;
+
+  // Device memory pools and allocators.
+  bool supports_memory_pools;
+  iree_hal_cuda2_memory_pools_t memory_pools;
+  iree_hal_allocator_t* device_allocator;
+} iree_hal_cuda2_device_t;
+
+static const iree_hal_device_vtable_t iree_hal_cuda2_device_vtable;
+
+static iree_hal_cuda2_device_t* iree_hal_cuda2_device_cast(
+    iree_hal_device_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_device_vtable);
+  return (iree_hal_cuda2_device_t*)base_value;
+}
+
+static iree_hal_cuda2_device_t* iree_hal_cuda2_device_cast_unsafe(
+    iree_hal_device_t* base_value) {
+  return (iree_hal_cuda2_device_t*)base_value;
+}
+
+IREE_API_EXPORT void iree_hal_cuda2_device_params_initialize(
+    iree_hal_cuda2_device_params_t* out_params) {
+  memset(out_params, 0, sizeof(*out_params));
+  out_params->arena_block_size = 32 * 1024;
+  out_params->queue_count = 1;
+  out_params->async_allocations = true;
+}
+
+static iree_status_t iree_hal_cuda2_device_check_params(
+    const iree_hal_cuda2_device_params_t* params) {
+  if (params->arena_block_size < 4096) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "arena block size too small (< 4096 bytes)");
+  }
+  if (params->queue_count == 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "at least one queue is required");
+  }
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda2_device_create_internal(
+    iree_hal_driver_t* driver, iree_string_view_t identifier,
+    const iree_hal_cuda2_device_params_t* params, CUdevice cu_device,
+    CUstream stream, CUcontext context,
+    const iree_hal_cuda2_dynamic_symbols_t* symbols,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  iree_hal_cuda2_device_t* device = NULL;
+  iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(host_allocator, total_size, (void**)&device));
+  memset(device, 0, total_size);
+
+  iree_hal_resource_initialize(&iree_hal_cuda2_device_vtable,
+                               &device->resource);
+  iree_string_view_append_to_buffer(
+      identifier, &device->identifier,
+      (char*)device + iree_sizeof_struct(*device));
+  iree_arena_block_pool_initialize(params->arena_block_size, host_allocator,
+                                   &device->block_pool);
+  device->driver = driver;
+  iree_hal_driver_retain(device->driver);
+  device->cuda_symbols = symbols;
+  device->params = *params;
+  device->cu_context = context;
+  device->cu_device = cu_device;
+  device->cu_stream = stream;
+  device->host_allocator = host_allocator;
+
+  iree_status_t status = iree_ok_status();
+
+  // Memory pool support is conditional.
+  if (iree_status_is_ok(status) && params->async_allocations) {
+    int supports_memory_pools = 0;
+    status = IREE_CURESULT_TO_STATUS(
+        symbols,
+        cuDeviceGetAttribute(&supports_memory_pools,
+                             CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
+                             cu_device),
+        "cuDeviceGetAttribute");
+    device->supports_memory_pools = supports_memory_pools != 0;
+  }
+
+  // Create memory pools first so that we can share them with the allocator.
+  if (iree_status_is_ok(status) && device->supports_memory_pools) {
+    status = iree_hal_cuda2_memory_pools_initialize(
+        symbols, cu_device, &params->memory_pools, host_allocator,
+        &device->memory_pools);
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_allocator_create(
+        (iree_hal_device_t*)device, symbols, cu_device, stream,
+        device->supports_memory_pools ? &device->memory_pools : NULL,
+        host_allocator, &device->device_allocator);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_device = (iree_hal_device_t*)device;
+  } else {
+    iree_hal_device_release((iree_hal_device_t*)device);
+  }
+  return status;
+}
+
+iree_status_t iree_hal_cuda2_device_create(
+    iree_hal_driver_t* driver, iree_string_view_t identifier,
+    const iree_hal_cuda2_device_params_t* params,
+    const iree_hal_cuda2_dynamic_symbols_t* symbols, CUdevice device,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  IREE_ASSERT_ARGUMENT(driver);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(symbols);
+  IREE_ASSERT_ARGUMENT(out_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_hal_cuda2_device_check_params(params);
+
+  // Get the main context for the device.
+  CUcontext context = NULL;
+  if (iree_status_is_ok(status)) {
+    status = IREE_CURESULT_TO_STATUS(
+        symbols, cuDevicePrimaryCtxRetain(&context, device));
+  }
+  if (iree_status_is_ok(status)) {
+    status = IREE_CURESULT_TO_STATUS(symbols, cuCtxSetCurrent(context));
+  }
+
+  // Create the default stream for the device.
+  CUstream stream = NULL;
+  if (iree_status_is_ok(status)) {
+    status = IREE_CURESULT_TO_STATUS(
+        symbols, cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_device_create_internal(
+        driver, identifier, params, device, stream, context, symbols,
+        host_allocator, out_device);
+  }
+  if (!iree_status_is_ok(status)) {
+    if (stream) symbols->cuStreamDestroy(stream);
+    if (context) symbols->cuDevicePrimaryCtxRelease(device);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+CUcontext iree_hal_cuda2_device_context(iree_hal_device_t* base_device) {
+  iree_hal_cuda2_device_t* device =
+      iree_hal_cuda2_device_cast_unsafe(base_device);
+  return device->cu_context;
+}
+
+const iree_hal_cuda2_dynamic_symbols_t* iree_hal_cuda2_device_dynamic_symbols(
+    iree_hal_device_t* base_device) {
+  iree_hal_cuda2_device_t* device =
+      iree_hal_cuda2_device_cast_unsafe(base_device);
+  return device->cuda_symbols;
+}
+
+static void iree_hal_cuda2_device_destroy(iree_hal_device_t* base_device) {
+  iree_hal_cuda2_device_t* device = iree_hal_cuda2_device_cast(base_device);
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // There should be no more buffers live that use the allocator.
+  iree_hal_allocator_release(device->device_allocator);
+
+  // Destroy memory pools that hold on to reserved memory.
+  iree_hal_cuda2_memory_pools_deinitialize(&device->memory_pools);
+
+  // TODO: support multiple streams.
+  IREE_CUDA_IGNORE_ERROR(device->cuda_symbols,
+                         cuStreamDestroy(device->cu_stream));
+
+  IREE_CUDA_IGNORE_ERROR(device->cuda_symbols,
+                         cuDevicePrimaryCtxRelease(device->cu_device));
+
+  iree_arena_block_pool_deinitialize(&device->block_pool);
+
+  // Finally, destroy the device.
+  iree_hal_driver_release(device->driver);
+
+  iree_allocator_free(host_allocator, device);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_string_view_t iree_hal_cuda2_device_id(
+    iree_hal_device_t* base_device) {
+  iree_hal_cuda2_device_t* device = iree_hal_cuda2_device_cast(base_device);
+  return device->identifier;
+}
+
+static iree_allocator_t iree_hal_cuda2_device_host_allocator(
+    iree_hal_device_t* base_device) {
+  iree_hal_cuda2_device_t* device = iree_hal_cuda2_device_cast(base_device);
+  return device->host_allocator;
+}
+
+static iree_hal_allocator_t* iree_hal_cuda2_device_allocator(
+    iree_hal_device_t* base_device) {
+  iree_hal_cuda2_device_t* device = iree_hal_cuda2_device_cast(base_device);
+  return device->device_allocator;
+}
+
+static void iree_hal_cuda2_replace_device_allocator(
+    iree_hal_device_t* base_device, iree_hal_allocator_t* new_allocator) {
+  iree_hal_cuda2_device_t* device = iree_hal_cuda2_device_cast(base_device);
+  iree_hal_allocator_retain(new_allocator);
+  iree_hal_allocator_release(device->device_allocator);
+  device->device_allocator = new_allocator;
+}
+
+static void iree_hal_cuda2_replace_channel_provider(
+    iree_hal_device_t* base_device, iree_hal_channel_provider_t* new_provider) {
+  // TODO: implement this together with channel support.
+}
+
+static iree_status_t iree_hal_cuda2_device_trim(
+    iree_hal_device_t* base_device) {
+  iree_hal_cuda2_device_t* device = iree_hal_cuda2_device_cast(base_device);
+  iree_arena_block_pool_trim(&device->block_pool);
+  IREE_RETURN_IF_ERROR(iree_hal_allocator_trim(device->device_allocator));
+  if (device->supports_memory_pools) {
+    IREE_RETURN_IF_ERROR(iree_hal_cuda2_memory_pools_trim(
+        &device->memory_pools, &device->params.memory_pools));
+  }
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda2_device_query_attribute(
+    iree_hal_cuda2_device_t* device, CUdevice_attribute attribute,
+    int64_t* out_value) {
+  int value = 0;
+  IREE_CUDA_RETURN_IF_ERROR(
+      device->cuda_symbols,
+      cuDeviceGetAttribute(&value, attribute, device->cu_device),
+      "cuDeviceGetAttribute");
+  *out_value = value;
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda2_device_query_i64(
+    iree_hal_device_t* base_device, iree_string_view_t category,
+    iree_string_view_t key, int64_t* out_value) {
+  iree_hal_cuda2_device_t* device = iree_hal_cuda2_device_cast(base_device);
+  *out_value = 0;
+
+  if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) {
+    *out_value = iree_string_view_equal(key, IREE_SV("cuda-nvptx-fb")) ? 1 : 0;
+    return iree_ok_status();
+  }
+
+  if (iree_string_view_equal(category, IREE_SV("cuda.device"))) {
+    if (iree_string_view_equal(key, IREE_SV("compute_capability_major"))) {
+      return iree_hal_cuda2_device_query_attribute(
+          device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, out_value);
+    } else if (iree_string_view_equal(key,
+                                      IREE_SV("compute_capability_minor"))) {
+      return iree_hal_cuda2_device_query_attribute(
+          device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, out_value);
+    }
+  }
+
+  return iree_make_status(
+      IREE_STATUS_NOT_FOUND,
+      "unknown device configuration key value '%.*s :: %.*s'",
+      (int)category.size, category.data, (int)key.size, key.data);
+}
+
+static iree_status_t iree_hal_cuda2_device_create_channel(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_channel_params_t params, iree_hal_channel_t** out_channel) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "channel not yet implmeneted");
+}
+
+static iree_status_t iree_hal_cuda2_device_create_command_buffer(
+    iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "command buffer not yet implmeneted");
+}
+
+static iree_status_t iree_hal_cuda2_device_create_descriptor_set_layout(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_flags_t flags,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "descriptor set layout not yet implmeneted");
+}
+
+static iree_status_t iree_hal_cuda2_device_create_event(
+    iree_hal_device_t* base_device, iree_hal_event_t** out_event) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "event not yet implmeneted");
+}
+
+static iree_status_t iree_hal_cuda2_device_create_executable_cache(
+    iree_hal_device_t* base_device, iree_string_view_t identifier,
+    iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "executable cache not yet implmeneted");
+}
+
+static iree_status_t iree_hal_cuda2_device_create_pipeline_layout(
+    iree_hal_device_t* base_device, iree_host_size_t push_constants,
+    iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t* const* set_layouts,
+    iree_hal_pipeline_layout_t** out_pipeline_layout) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "pipeline layout not yet implmeneted");
+}
+
+static iree_status_t iree_hal_cuda2_device_create_semaphore(
+    iree_hal_device_t* base_device, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "semaphore not yet implmeneted");
+}
+
+static iree_hal_semaphore_compatibility_t
+iree_hal_cuda2_device_query_semaphore_compatibility(
+    iree_hal_device_t* base_device, iree_hal_semaphore_t* semaphore) {
+  // TODO: implement CUDA semaphores.
+  return IREE_HAL_SEMAPHORE_COMPATIBILITY_NONE;
+}
+
+// TODO: implement multiple streams; today we only have one and queue_affinity
+//       is ignored.
+// TODO: implement proper semaphores in CUDA to ensure ordering and avoid
+//       the barrier here.
+static iree_status_t iree_hal_cuda2_device_queue_alloca(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
+    iree_device_size_t allocation_size,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "queue alloca not yet implmeneted");
+}
+
+// TODO: implement multiple streams; today we only have one and queue_affinity
+//       is ignored.
+// TODO: implement proper semaphores in CUDA to ensure ordering and avoid
+//       the barrier here.
+static iree_status_t iree_hal_cuda2_device_queue_dealloca(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* buffer) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "queue dealloca not yet implmeneted");
+}
+
+static iree_status_t iree_hal_cuda2_device_queue_execute(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_host_size_t command_buffer_count,
+    iree_hal_command_buffer_t* const* command_buffers) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "queue execution not yet implmeneted");
+}
+
+static iree_status_t iree_hal_cuda2_device_queue_flush(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity) {
+  // Currently unused; we flush as submissions are made.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda2_device_wait_semaphores(
+    iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "semaphore not yet implemented");
+}
+
+static iree_status_t iree_hal_cuda2_device_profiling_begin(
+    iree_hal_device_t* base_device,
+    const iree_hal_device_profiling_options_t* options) {
+  // Unimplemented (and that's ok).
+  // We could hook in to CUPTI here or use the much simpler cuProfilerStart API.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda2_device_profiling_end(
+    iree_hal_device_t* base_device) {
+  // Unimplemented (and that's ok).
+  return iree_ok_status();
+}
+
+static const iree_hal_device_vtable_t iree_hal_cuda2_device_vtable = {
+    .destroy = iree_hal_cuda2_device_destroy,
+    .id = iree_hal_cuda2_device_id,
+    .host_allocator = iree_hal_cuda2_device_host_allocator,
+    .device_allocator = iree_hal_cuda2_device_allocator,
+    .replace_device_allocator = iree_hal_cuda2_replace_device_allocator,
+    .replace_channel_provider = iree_hal_cuda2_replace_channel_provider,
+    .trim = iree_hal_cuda2_device_trim,
+    .query_i64 = iree_hal_cuda2_device_query_i64,
+    .create_channel = iree_hal_cuda2_device_create_channel,
+    .create_command_buffer = iree_hal_cuda2_device_create_command_buffer,
+    .create_descriptor_set_layout =
+        iree_hal_cuda2_device_create_descriptor_set_layout,
+    .create_event = iree_hal_cuda2_device_create_event,
+    .create_executable_cache = iree_hal_cuda2_device_create_executable_cache,
+    .create_pipeline_layout = iree_hal_cuda2_device_create_pipeline_layout,
+    .create_semaphore = iree_hal_cuda2_device_create_semaphore,
+    .query_semaphore_compatibility =
+        iree_hal_cuda2_device_query_semaphore_compatibility,
+    .transfer_range = iree_hal_device_submit_transfer_range_and_wait,
+    .queue_alloca = iree_hal_cuda2_device_queue_alloca,
+    .queue_dealloca = iree_hal_cuda2_device_queue_dealloca,
+    .queue_execute = iree_hal_cuda2_device_queue_execute,
+    .queue_flush = iree_hal_cuda2_device_queue_flush,
+    .wait_semaphores = iree_hal_cuda2_device_wait_semaphores,
+    .profiling_begin = iree_hal_cuda2_device_profiling_begin,
+    .profiling_end = iree_hal_cuda2_device_profiling_end,
+};
diff --git a/experimental/cuda2/cuda_device.h b/experimental/cuda2/cuda_device.h
new file mode 100644
index 000000000000..70dbd3694cd4
--- /dev/null
+++ b/experimental/cuda2/cuda_device.h
@@ -0,0 +1,48 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef EXPERIMENTAL_CUDA2_CUDA_DEVICE_H_
+#define EXPERIMENTAL_CUDA2_CUDA_DEVICE_H_
+
+#include "experimental/cuda2/api.h"
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a device that owns and manages its own CUcontext.
+iree_status_t iree_hal_cuda2_device_create(
+    iree_hal_driver_t* driver, iree_string_view_t identifier,
+    const iree_hal_cuda2_device_params_t* params,
+    const iree_hal_cuda2_dynamic_symbols_t* symbols, CUdevice device,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device);
+
+// Returns the CUDA context bound to the given |device| if it is a CUDA device
+// and otherwise returns NULL.
+//
+// WARNING: this API is unsafe and unstable. HAL devices may have any number of
+// contexts and the context may be in use on other threads.
+CUcontext iree_hal_cuda2_device_context(iree_hal_device_t* device);
+
+// Returns the dynamic symbol table from the |device| if it is a CUDA device
+// and otherwise returns NULL.
+//
+// WARNING: the symbols are only valid for as long as the device is. Hosting
+// libraries and applications should prefer to either link against CUDA
+// themselves or maintain their own dynamic linking support: the IREE runtime
+// only provides the symbols required by the HAL driver and not the entirety of
+// the API.
+const iree_hal_cuda2_dynamic_symbols_t* iree_hal_cuda2_device_dynamic_symbols(
+    iree_hal_device_t* device);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // EXPERIMENTAL_CUDA2_CUDA_DEVICE_H_
diff --git a/experimental/cuda2/cuda_driver.c b/experimental/cuda2/cuda_driver.c
index 8d1590f1f403..8fee8b9af881 100644
--- a/experimental/cuda2/cuda_driver.c
+++ b/experimental/cuda2/cuda_driver.c
@@ -8,6 +8,7 @@
 #include <string.h>
 
 #include "experimental/cuda2/api.h"
+#include "experimental/cuda2/cuda_device.h"
 #include "experimental/cuda2/cuda_dynamic_symbols.h"
 #include "experimental/cuda2/cuda_status_util.h"
 #include "experimental/cuda2/nccl_dynamic_symbols.h"
@@ -38,6 +39,9 @@ typedef struct iree_hal_cuda2_driver_t {
   // NCCL API dynamic symbols to interact with the CUDA system.
   iree_hal_cuda2_nccl_dynamic_symbols_t nccl_symbols;
 
+  // The default parameters for creating devices using this driver.
+  iree_hal_cuda2_device_params_t device_params;
+
   // The index of the default CUDA device to use if multiple ones are available.
   int default_device_index;
 } iree_hal_cuda2_driver_t;
@@ -60,6 +64,7 @@ IREE_API_EXPORT void iree_hal_cuda2_driver_options_initialize(
 static iree_status_t iree_hal_cuda2_driver_create_internal(
     iree_string_view_t identifier,
     const iree_hal_cuda2_driver_options_t* options,
+    const iree_hal_cuda2_device_params_t* device_params,
     iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) {
   iree_hal_cuda2_driver_t* driver = NULL;
   iree_host_size_t total_size = iree_sizeof_struct(*driver) + identifier.size;
@@ -86,6 +91,8 @@ static iree_status_t iree_hal_cuda2_driver_create_internal(
     if (iree_status_is_unavailable(status)) status = iree_status_ignore(status);
   }
 
+  memcpy(&driver->device_params, device_params, sizeof(driver->device_params));
+
   if (iree_status_is_ok(status)) {
     *out_driver = (iree_hal_driver_t*)driver;
   } else {
@@ -97,13 +104,15 @@ static iree_status_t iree_hal_cuda2_driver_create_internal(
 IREE_API_EXPORT iree_status_t iree_hal_cuda2_driver_create(
     iree_string_view_t identifier,
     const iree_hal_cuda2_driver_options_t* options,
+    const iree_hal_cuda2_device_params_t* device_params,
     iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) {
   IREE_ASSERT_ARGUMENT(options);
+  IREE_ASSERT_ARGUMENT(device_params);
   IREE_ASSERT_ARGUMENT(out_driver);
   IREE_TRACE_ZONE_BEGIN(z0);
 
   iree_status_t status = iree_hal_cuda2_driver_create_internal(
-      identifier, options, host_allocator, out_driver);
+      identifier, options, device_params, host_allocator, out_driver);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
@@ -438,7 +447,6 @@ static iree_status_t iree_hal_cuda2_driver_create_device_by_id(
     iree_host_size_t param_count, const iree_string_pair_t* params,
     iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
   IREE_ASSERT_ARGUMENT(base_driver);
-  IREE_ASSERT_ARGUMENT(params);
   IREE_ASSERT_ARGUMENT(out_device);
 
   iree_hal_cuda2_driver_t* driver = iree_hal_cuda2_driver_cast(base_driver);
@@ -458,10 +466,16 @@ static iree_status_t iree_hal_cuda2_driver_create_device_by_id(
   } else {
     device = IREE_DEVICE_ID_TO_CUDEVICE(device_id);
   }
-  (void)device;
+
+  iree_string_view_t device_name = iree_make_cstring_view("cuda2");
+
+  // Attempt to create the device now.
+  iree_status_t status = iree_hal_cuda2_device_create(
+      base_driver, device_name, &driver->device_params, &driver->cuda_symbols,
+      device, host_allocator, out_device);
 
   IREE_TRACE_ZONE_END(z0);
-  return iree_status_from_code(IREE_STATUS_UNIMPLEMENTED);
+  return status;
 }
 
 static iree_status_t iree_hal_cuda2_driver_create_device_by_uuid(
@@ -560,7 +574,6 @@ static iree_status_t iree_hal_cuda2_driver_create_device_by_path(
     const iree_string_pair_t* params, iree_allocator_t host_allocator,
     iree_hal_device_t** out_device) {
   IREE_ASSERT_ARGUMENT(base_driver);
-  IREE_ASSERT_ARGUMENT(params);
   IREE_ASSERT_ARGUMENT(out_device);
 
   if (iree_string_view_is_empty(device_path)) {
diff --git a/experimental/cuda2/memory_pools.c b/experimental/cuda2/memory_pools.c
index b7b7baad9e59..c20c9ebe69e4 100644
--- a/experimental/cuda2/memory_pools.c
+++ b/experimental/cuda2/memory_pools.c
@@ -59,9 +59,9 @@ static iree_status_t iree_hal_cuda2_create_memory_pool(
 }
 
 iree_status_t iree_hal_cuda2_memory_pools_initialize(
-    iree_allocator_t host_allocator,
     const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
     const iree_hal_cuda2_memory_pooling_params_t* pooling_params,
+    iree_allocator_t host_allocator,
     iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools) {
   IREE_ASSERT_ARGUMENT(cuda_symbols);
   IREE_ASSERT_ARGUMENT(pooling_params);
diff --git a/experimental/cuda2/memory_pools.h b/experimental/cuda2/memory_pools.h
index e88e05fd63d8..9c1e59b0c0d0 100644
--- a/experimental/cuda2/memory_pools.h
+++ b/experimental/cuda2/memory_pools.h
@@ -38,9 +38,9 @@ typedef struct iree_hal_cuda2_memory_pools_t {
 
 // Initializes |out_pools| by configuring new CUDA memory pools.
 iree_status_t iree_hal_cuda2_memory_pools_initialize(
-    iree_allocator_t host_allocator,
     const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
     const iree_hal_cuda2_memory_pooling_params_t* pooling_params,
+    iree_allocator_t host_allocator,
     iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools);
 
 // Deinitializes the |pools| and releases the underlying CUDA resources.
diff --git a/experimental/cuda2/registration/driver_module.c b/experimental/cuda2/registration/driver_module.c
index 4089ed0d1165..b805bd5ae323 100644
--- a/experimental/cuda2/registration/driver_module.c
+++ b/experimental/cuda2/registration/driver_module.c
@@ -15,6 +15,10 @@
 #include "iree/base/status.h"
 #include "iree/base/tracing.h"
 
+IREE_FLAG(
+    bool, cuda_async_allocations, true,
+    "Enables CUDA asynchronous stream-ordered allocations when supported.");
+
 IREE_FLAG(int32_t, cuda2_default_index, 0,
           "Specifies the index of the default CUDA device to use");
 
@@ -80,6 +84,10 @@ static iree_status_t iree_hal_cuda2_driver_factory_try_create(
   iree_hal_cuda2_driver_options_t driver_options;
   iree_hal_cuda2_driver_options_initialize(&driver_options);
 
+  iree_hal_cuda2_device_params_t device_params;
+  iree_hal_cuda2_device_params_initialize(&device_params);
+  device_params.async_allocations = FLAG_cuda_async_allocations;
+
   driver_options.default_device_index = FLAG_cuda2_default_index;
   if (FLAG_cuda2_default_index_from_mpi) {
     driver_options.default_device_index =
@@ -88,7 +96,7 @@ static iree_status_t iree_hal_cuda2_driver_factory_try_create(
   }
 
   iree_status_t status = iree_hal_cuda2_driver_create(
-      driver_name, &driver_options, host_allocator, out_driver);
+      driver_name, &driver_options, &device_params, host_allocator, out_driver);
 
   IREE_TRACE_ZONE_END(z0);