Merge branch 'branch-24.12' into improve_nvtx

rapidsai · Oct 31, 2024 · 48785d3 · 48785d3
2 parents 4d01c21 + b65eda3
commit 48785d3
Show file tree

Hide file tree

Showing 15 changed files with 166 additions and 24 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -92,7 +92,7 @@ repos:
               meta[.]yaml$
       - id: verify-alpha-spec
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.11
+    rev: v1.16.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]

diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
@@ -16,6 +16,10 @@ rapids-print-env
 rapids-logger "Begin cpp build"
 conda config --set path_conflict prevent
 
+sccache --zero-stats
+
 rapids-conda-retry mambabuild conda/recipes/libkvikio
 
+sccache --show-adv-stats
+
 rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_python.sh b/ci/build_python.sh
@@ -18,8 +18,12 @@ rapids-logger "Begin py build"
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 conda config --set path_conflict prevent
 
+sccache --zero-stats
+
 rapids-conda-retry mambabuild \
   --channel "${CPP_CHANNEL}" \
   conda/recipes/kvikio
 
+sccache --show-adv-stats
+
 rapids-upload-conda-to-s3 python
diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh
@@ -13,8 +13,12 @@ rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
+sccache --zero-stats
+
 python -m pip install wheel
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
+
+sccache --show-adv-stats
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist
diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh
@@ -23,14 +23,18 @@ cd "${package_dir}"
 # are used when creating the isolated build environment
 echo "libkvikio-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${CPP_WHEELHOUSE}/libkvikio_*.whl)" > ./constraints.txt
 
+sccache --zero-stats
+
 PIP_CONSTRAINT="${PWD}/constraints.txt" \
 SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" \
-    python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+    python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
+
+sccache --show-adv-stats
 
 mkdir -p final_dist
 python -m auditwheel repair \
     --exclude libnvcomp.so.4 \
     -w final_dist \
     dist/*
 
-RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
+RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -25,7 +25,7 @@ dependencies:
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-aarch64=11.8
-- nvcomp==4.0.1
+- nvcomp==4.1.0.6
 - packaging
 - pre-commit
 - pytest

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -27,7 +27,7 @@ dependencies:
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
-- nvcomp==4.0.1
+- nvcomp==4.1.0.6
 - packaging
 - pre-commit
 - pytest

diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -25,7 +25,7 @@ dependencies:
 - numcodecs !=0.12.0
 - numpy>=1.23,<3.0a0
 - numpydoc
-- nvcomp==4.0.1
+- nvcomp==4.1.0.6
 - packaging
 - pre-commit
 - pytest

diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -25,7 +25,7 @@ dependencies:
 - numcodecs !=0.12.0
 - numpy>=1.23,<3.0a0
 - numpydoc
-- nvcomp==4.0.1
+- nvcomp==4.1.0.6
 - packaging
 - pre-commit
 - pytest

diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml
@@ -20,4 +20,4 @@ cmake_version:
   - ">=3.26.4,!=3.30.0"
 
 nvcomp_version:
-  - "=4.0.1"
+  - "=4.1.0.6"
diff --git a/cpp/include/kvikio/bounce_buffer.hpp b/cpp/include/kvikio/bounce_buffer.hpp
@@ -36,6 +36,7 @@ class AllocRetain {
   // The size of each allocation in `_free_allocs`
   std::size_t _size{defaults::bounce_buffer_size()};
 
+ public:
   /**
    * @brief An host memory allocation
    */
@@ -56,6 +57,7 @@ class AllocRetain {
     Alloc& operator=(Alloc&& o)    = delete;
     ~Alloc() noexcept { _manager->put(_alloc, _size); }
     void* get() noexcept { return _alloc; }
+    void* get(std::ptrdiff_t offset) noexcept { return static_cast<char*>(_alloc) + offset; }
     std::size_t size() noexcept { return _size; }
   };
 
@@ -67,6 +69,7 @@ class AllocRetain {
   // <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
   ~AllocRetain() noexcept = default;
 
+ private:
   /**
    * @brief Free all retained allocations
    *

diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#include <cassert>
 #include <cstddef>
 #include <cstring>
+#include <iostream>
 #include <memory>
 #include <optional>
 #include <regex>
@@ -34,6 +36,98 @@
 namespace kvikio {
 namespace detail {
 
+/**
+ * @brief Bounce buffer in pinned host memory.
+ *
+ * @note Is not thread-safe.
+ */
+class BounceBufferH2D {
+  CUstream _stream;                 // The CUDA stream to use.
+  CUdeviceptr _dev;                 // The output device buffer.
+  AllocRetain::Alloc _host_buffer;  // The host buffer to bounce data on.
+  std::ptrdiff_t _dev_offset{0};    // Number of bytes written to `_dev`.
+  std::ptrdiff_t _host_offset{0};   // Number of bytes written to `_host_buffer` (resets on flush).
+
+ public:
+  /**
+   * @brief Create a bounce buffer for an output device buffer.
+   *
+   * @param stream The CUDA stream used throughout the lifetime of the bounce buffer.
+   * @param device_buffer The output device buffer (final destination of the data).
+   */
+  BounceBufferH2D(CUstream stream, void* device_buffer)
+    : _stream{stream},
+      _dev{convert_void2deviceptr(device_buffer)},
+      _host_buffer{AllocRetain::instance().get()}
+  {
+  }
+
+  /**
+   * @brief The bounce buffer is flushed to device on destruction.
+   */
+  ~BounceBufferH2D() noexcept
+  {
+    try {
+      flush();
+    } catch (CUfileException const& e) {
+      std::cerr << "BounceBufferH2D error on final flush: ";
+      std::cerr << e.what();
+      std::cerr << std::endl;
+    }
+  }
+
+ private:
+  /**
+   * @brief Write host memory to the output device buffer.
+   *
+   * @param src The host memory source.
+   * @param size Number of bytes to write.
+   */
+  void write_to_device(void const* src, std::size_t size)
+  {
+    if (size > 0) {
+      CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(_dev + _dev_offset, src, size, _stream));
+      CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream));
+      _dev_offset += size;
+    }
+  }
+
+  /**
+   * @brief Flush the bounce buffer by writing everything to the output device buffer.
+   */
+  void flush()
+  {
+    write_to_device(_host_buffer.get(), _host_offset);
+    _host_offset = 0;
+  }
+
+ public:
+  /**
+   * @brief Write host memory to the bounce buffer (also host memory).
+   *
+   * Only when the bounce buffer has been filled up is data copied to the output device buffer.
+   *
+   * @param data The host memory source.
+   * @param size Number of bytes to write.
+   */
+  void write(char const* data, std::size_t size)
+  {
+    if (_host_buffer.size() - _host_offset < size) {  // Not enough space left in the bounce buffer
+      flush();
+      assert(_host_offset == 0);
+    }
+    if (_host_buffer.size() < size) {
+      // If still not enough space, we just copy the data to the device. This only happens when
+      // `defaults::bounce_buffer_size()` is smaller than 16kb, so there is no need to optimize
+      // the performance of this case.
+      write_to_device(data, size);
+    } else if (size > 0) {
+      std::memcpy(_host_buffer.get(_host_offset), data, size);
+      _host_offset += size;
+    }
+  }
+};
+
 /**
  * @brief Context used by the "CURLOPT_WRITEFUNCTION" callbacks.
  */
@@ -46,6 +140,7 @@ struct CallbackContext {
     : buf{static_cast<char*>(buf)}, size{size}, offset{0}, overflow_error{0}
   {
   }
+  BounceBufferH2D* bounce_buffer{nullptr};  // Only used by callback_device_memory
 };
 
 /**
@@ -98,12 +193,7 @@ inline std::size_t callback_device_memory(char* data,
   }
   KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle - callback_device_memory()", nbytes);
 
-  CUstream stream = detail::StreamsByThread::get();
-  CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(
-    convert_void2deviceptr(ctx->buf + ctx->offset), data, nbytes, stream));
-  // We have to sync since curl might overwrite or free `data`.
-  CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream));
-
+  ctx->bounce_buffer->write(data, nbytes);
   ctx->offset += nbytes;
   return nbytes;
 }
@@ -414,6 +504,10 @@ class RemoteHandle {
   /**
    * @brief Read from remote source into buffer (host or device memory).
    *
+   * When reading into device memory, a bounce buffer is used to avoid many small memory
+   * copies to device. Use `kvikio::defaults::bounce_buffer_size_reset()` to set the size
+   * of this bounce buffer (default 16 MiB).
+   *
    * @param buf Pointer to host or device memory.
    * @param size Number of bytes to read.
    * @param file_offset File offset in bytes.
@@ -450,6 +544,10 @@ class RemoteHandle {
         curl.perform();
       } else {
         PushAndPopContext c(get_context_from_pointer(buf));
+        // We use a bounce buffer to avoid many small memory copies to device. Libcurl has a
+        // maximum chunk size of 16kb (`CURL_MAX_WRITE_SIZE`) but chunks are often much smaller.
+        detail::BounceBufferH2D bounce_buffer(detail::StreamsByThread::get(), buf);
+        ctx.bounce_buffer = &bounce_buffer;
         curl.perform();
       }
     } catch (std::runtime_error const& e) {

diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp
@@ -150,7 +150,7 @@ class CurlHandle {
   /**
    * @brief Construct a new curl handle.
    *
-   * Typically, do not use this directly instead use the `create_curl_handle()` macro.
+   * Typically, do not call this directly; instead, use the `create_curl_handle()` macro.
    *
    * @param handle An unused curl easy handle pointer, which is retained on destruction.
    * @param source_file Path of source file of the caller (for error messages).
@@ -166,6 +166,7 @@ class CurlHandle {
     setopt(CURLOPT_NOSIGNAL, 1L);
 
     // We always set CURLOPT_ERRORBUFFER to get better error messages.
+    _errbuf[0] = 0;  // Set the error buffer as empty.
     setopt(CURLOPT_ERRORBUFFER, _errbuf);
 
     // Make curl_easy_perform() fail when receiving HTTP code errors.
@@ -216,7 +217,7 @@ class CurlHandle {
     // Perform the curl operation and check for errors.
     CURLcode err = curl_easy_perform(handle());
     if (err != CURLE_OK) {
-      std::string msg(_errbuf);
+      std::string msg(_errbuf);  // We can do this because we always initialize `_errbuf` as empty.
       std::stringstream ss;
       ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line;
       if (msg.empty()) {
@@ -249,12 +250,36 @@ class CurlHandle {
   }
 };
 
+namespace detail {
+/**
+ * @brief Fix Conda's manipulation of __FILE__.
+ *
+ * Conda manipulates the path information in its shared libraries[1] with the result that the
+ * C macro `__FILE__` might contain trailing `\0` chars. Normally, this isn't a problem because
+ * `__FILE__` is a `const char*` that is terminated by the first encounter of `\0`. However, when
+ * creating a `std::string` from a `char*`, the compiler might optimize the code such that the
+ * `std::string` is created from the full size of `__FILE__` including the trailing `\0` chars.
+ *
+ * The extra `\0` is problematic if `CurlHandle` later throws an exception to Cython since, while
+ * converting the exception to Python, Cython might truncate the error message.
+ *
+ * [1] <https://docs.conda.io/projects/conda-build/en/latest/resources/make-relocatable.html>
+ */
+__attribute__((noinline)) inline std::string fix_conda_file_path_hack(std::string filename)
+{
+  if (filename.data() != nullptr) { return std::string{filename.data()}; }
+  return std::string{};
+}
+}  // namespace detail
+
 /**
  * @brief Create a new curl handle.
  *
  * @returns A `kvikio::CurlHandle` instance ready to be used.
  */
-#define create_curl_handle() \
-  kvikio::CurlHandle(kvikio::LibCurl::instance().get_handle(), __FILE__, KVIKIO_STRINGIFY(__LINE__))
+#define create_curl_handle()                                             \
+  kvikio::CurlHandle(kvikio::LibCurl::instance().get_handle(),           \
+                     kvikio::detail::fix_conda_file_path_hack(__FILE__), \
+                     KVIKIO_STRINGIFY(__LINE__))
 
 }  // namespace kvikio
diff --git a/dependencies.yaml b/dependencies.yaml
@@ -267,21 +267,21 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - nvcomp==4.0.1
+          - nvcomp==4.1.0.6
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix:
               cuda: "12.*"
             packages:
-              - nvidia-nvcomp-cu12==4.0.1
+              - nvidia-nvcomp-cu12==4.1.0.6
           - matrix:
               cuda: "11.*"
             packages:
-              - nvidia-nvcomp-cu11==4.0.1
+              - nvidia-nvcomp-cu11==4.1.0.6
           - matrix:
             packages:
-              - nvidia-nvcomp==4.0.1
+              - nvidia-nvcomp==4.1.0.6
   docs:
     common:
       - output_types: [conda, requirements]

diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "cupy-cuda11x>=12.0.0",
     "numcodecs !=0.12.0",
     "numpy>=1.23,<3.0a0",
-    "nvidia-nvcomp==4.0.1",
+    "nvidia-nvcomp==4.1.0.6",
     "packaging",
     "zarr",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.