diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d89e3ab71c..37d0178853 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -92,7 +92,7 @@ repos: meta[.]yaml$ - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 27ea30176d..8fb3a35991 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -16,6 +16,10 @@ rapids-print-env rapids-logger "Begin cpp build" conda config --set path_conflict prevent +sccache --zero-stats + rapids-conda-retry mambabuild conda/recipes/libkvikio +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 0b39b6c91f..7e0fc0bf93 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -18,8 +18,12 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) conda config --set path_conflict prevent +sccache --zero-stats + rapids-conda-retry mambabuild \ --channel "${CPP_CHANNEL}" \ conda/recipes/kvikio +sccache --show-adv-stats + rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index f576571568..b11cdf6677 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -13,8 +13,12 @@ rapids-generate-version > ./VERSION cd "${package_dir}" +sccache --zero-stats + python -m pip install wheel -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +python -m pip wheel . 
-w dist -v --no-deps --disable-pip-version-check + +sccache --show-adv-stats RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index 65ac10e5a3..44cb76586c 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -23,9 +23,13 @@ cd "${package_dir}" # are used when creating the isolated build environment echo "libkvikio-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${CPP_WHEELHOUSE}/libkvikio_*.whl)" > ./constraints.txt +sccache --zero-stats + PIP_CONSTRAINT="${PWD}/constraints.txt" \ SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" \ - python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check + python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check + +sccache --show-adv-stats mkdir -p final_dist python -m auditwheel repair \ @@ -33,4 +37,4 @@ python -m auditwheel repair \ -w final_dist \ dist/* -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index ef1215d51b..54cbb12072 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -25,7 +25,7 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-aarch64=11.8 -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - packaging - pre-commit - pytest diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 842b984cc6..ff3da55951 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc 
- nvcc_linux-64=11.8 -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - packaging - pre-commit - pytest diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 9a4b3e94bd..41f6e6b3a3 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -25,7 +25,7 @@ dependencies: - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - packaging - pre-commit - pytest diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 2b926acf29..cab86c6fdc 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -25,7 +25,7 @@ dependencies: - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - packaging - pre-commit - pytest diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml index 35cb51e2f9..8af5f6f9c2 100644 --- a/conda/recipes/kvikio/conda_build_config.yaml +++ b/conda/recipes/kvikio/conda_build_config.yaml @@ -20,4 +20,4 @@ cmake_version: - ">=3.26.4,!=3.30.0" nvcomp_version: - - "=4.0.1" + - "=4.1.0.6" diff --git a/cpp/include/kvikio/bounce_buffer.hpp b/cpp/include/kvikio/bounce_buffer.hpp index 9584fafb32..498f1d6f5f 100644 --- a/cpp/include/kvikio/bounce_buffer.hpp +++ b/cpp/include/kvikio/bounce_buffer.hpp @@ -36,6 +36,7 @@ class AllocRetain { // The size of each allocation in `_free_allocs` std::size_t _size{defaults::bounce_buffer_size()}; + public: /** * @brief An host memory allocation */ @@ -56,6 +57,7 @@ class AllocRetain { Alloc& operator=(Alloc&& o) = delete; ~Alloc() noexcept { _manager->put(_alloc, _size); } void* get() noexcept { return _alloc; } + void* get(std::ptrdiff_t offset) noexcept { return static_cast(_alloc) + offset; } std::size_t size() noexcept { return _size; } }; @@ -67,6 +69,7 @@ class 
AllocRetain { // ~AllocRetain() noexcept = default; + private: /** * @brief Free all retained allocations * diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 19960aaab2..5bb18f6396 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -15,8 +15,10 @@ */ #pragma once +#include #include #include +#include #include #include #include @@ -34,6 +36,98 @@ namespace kvikio { namespace detail { +/** + * @brief Bounce buffer in pinned host memory. + * + * @note Is not thread-safe. + */ +class BounceBufferH2D { + CUstream _stream; // The CUDA stream to use. + CUdeviceptr _dev; // The output device buffer. + AllocRetain::Alloc _host_buffer; // The host buffer to bounce data on. + std::ptrdiff_t _dev_offset{0}; // Number of bytes written to `_dev`. + std::ptrdiff_t _host_offset{0}; // Number of bytes written to `_host_buffer` (resets on flush). + + public: + /** + * @brief Create a bounce buffer for an output device buffer. + * + * @param stream The CUDA stream used throughout the lifetime of the bounce buffer. + * @param device_buffer The output device buffer (final destination of the data). + */ + BounceBufferH2D(CUstream stream, void* device_buffer) + : _stream{stream}, + _dev{convert_void2deviceptr(device_buffer)}, + _host_buffer{AllocRetain::instance().get()} + { + } + + /** + * @brief The bounce buffer is flushed to device on destruction. + */ + ~BounceBufferH2D() noexcept + { + try { + flush(); + } catch (CUfileException const& e) { + std::cerr << "BounceBufferH2D error on final flush: "; + std::cerr << e.what(); + std::cerr << std::endl; + } + } + + private: + /** + * @brief Write host memory to the output device buffer. + * + * @param src The host memory source. + * @param size Number of bytes to write.
+ */ + void write_to_device(void const* src, std::size_t size) + { + if (size > 0) { + CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(_dev + _dev_offset, src, size, _stream)); + CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream)); + _dev_offset += size; + } + } + + /** + * @brief Flush the bounce buffer by writing everything to the output device buffer. + */ + void flush() + { + write_to_device(_host_buffer.get(), _host_offset); + _host_offset = 0; + } + + public: + /** + * @brief Write host memory to the bounce buffer (also host memory). + * + * Only when the bounce buffer has been filled up is data copied to the output device buffer. + * + * @param data The host memory source. + * @param size Number of bytes to write. + */ + void write(char const* data, std::size_t size) + { + if (_host_buffer.size() - _host_offset < size) { // Not enough space left in the bounce buffer + flush(); + assert(_host_offset == 0); + } + if (_host_buffer.size() < size) { + // If still not enough space, we just copy the data to the device. This only happens when + // `defaults::bounce_buffer_size()` is smaller than 16kb thus no need to performance + // optimize for this case. + write_to_device(data, size); + } else if (size > 0) { + std::memcpy(_host_buffer.get(_host_offset), data, size); + _host_offset += size; + } + } +}; + /** * @brief Context used by the "CURLOPT_WRITEFUNCTION" callbacks. 
*/ @@ -46,6 +140,7 @@ struct CallbackContext { : buf{static_cast(buf)}, size{size}, offset{0}, overflow_error{0} { } + BounceBufferH2D* bounce_buffer{nullptr}; // Only used by callback_device_memory }; /** @@ -98,12 +193,7 @@ inline std::size_t callback_device_memory(char* data, } KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle - callback_device_memory()", nbytes); - CUstream stream = detail::StreamsByThread::get(); - CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( - convert_void2deviceptr(ctx->buf + ctx->offset), data, nbytes, stream)); - // We have to sync since curl might overwrite or free `data`. - CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); - + ctx->bounce_buffer->write(data, nbytes); ctx->offset += nbytes; return nbytes; } @@ -414,6 +504,10 @@ class RemoteHandle { /** * @brief Read from remote source into buffer (host or device memory). * + * When reading into device memory, a bounce buffer is used to avoid many small memory + * copies to device. Use `kvikio::defaults::bounce_buffer_size_reset()` to set the size + * of this bounce buffer (default 16 MiB). + * * @param buf Pointer to host or device memory. * @param size Number of bytes to read. * @param file_offset File offset in bytes. @@ -450,6 +544,10 @@ class RemoteHandle { curl.perform(); } else { PushAndPopContext c(get_context_from_pointer(buf)); + // We use a bounce buffer to avoid many small memory copies to device. Libcurl has a + // maximum chunk size of 16kb (`CURL_MAX_WRITE_SIZE`) but chunks are often much smaller. + detail::BounceBufferH2D bounce_buffer(detail::StreamsByThread::get(), buf); + ctx.bounce_buffer = &bounce_buffer; curl.perform(); } } catch (std::runtime_error const& e) { diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp index cee50c5947..423eff9c60 100644 --- a/cpp/include/kvikio/shim/libcurl.hpp +++ b/cpp/include/kvikio/shim/libcurl.hpp @@ -150,7 +150,7 @@ class CurlHandle { /** * @brief Construct a new curl handle.
* - * Typically, do not use this directly instead use the `create_curl_handle()` macro. + * Typically, do not call this directly; instead use the `create_curl_handle()` macro. * * @param handle An unused curl easy handle pointer, which is retained on destruction. * @param source_file Path of source file of the caller (for error messages). @@ -166,6 +166,7 @@ class CurlHandle { setopt(CURLOPT_NOSIGNAL, 1L); // We always set CURLOPT_ERRORBUFFER to get better error messages. + _errbuf[0] = 0; // Set the error buffer as empty. setopt(CURLOPT_ERRORBUFFER, _errbuf); // Make curl_easy_perform() fail when receiving HTTP code errors. @@ -216,7 +217,7 @@ class CurlHandle { // Perform the curl operation and check for errors. CURLcode err = curl_easy_perform(handle()); if (err != CURLE_OK) { - std::string msg(_errbuf); + std::string msg(_errbuf); // We can do this because we always initialize `_errbuf` as empty. std::stringstream ss; ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line; if (msg.empty()) { @@ -249,12 +250,36 @@ class CurlHandle { } }; +namespace detail { +/** + * @brief Fix Conda's manipulation of __FILE__. + * + * Conda manipulates the path information in its shared libraries[1] with the result that the + * C macro `__FILE__` might contain trailing `\0` chars. Normally, this isn't a problem because + * `__FILE__` is a `const char*` that is terminated by the first encounter of `\0`. However, when + * creating a `std::string` from a `char*`, the compiler might optimize the code such that the + * `std::string` is created from the full size of `__FILE__` including the trailing `\0` chars. + * + * The extra `\0` is problematic if `CurlHandle` later throws an exception to Cython since, while + * converting the exception to Python, Cython might truncate the error message.
+ * + * [1] + */ +__attribute__((noinline)) inline std::string fix_conda_file_path_hack(std::string filename) +{ + if (filename.data() != nullptr) { return std::string{filename.data()}; } + return std::string{}; +} +} // namespace detail + /** * @brief Create a new curl handle. * * @returns A `kvikio::CurlHandle` instance ready to be used. */ -#define create_curl_handle() \ - kvikio::CurlHandle(kvikio::LibCurl::instance().get_handle(), __FILE__, KVIKIO_STRINGIFY(__LINE__)) +#define create_curl_handle() \ + kvikio::CurlHandle(kvikio::LibCurl::instance().get_handle(), \ + kvikio::detail::fix_conda_file_path_hack(__FILE__), \ + KVIKIO_STRINGIFY(__LINE__)) } // namespace kvikio diff --git a/dependencies.yaml b/dependencies.yaml index 85bf871150..ae99fb5d83 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -267,21 +267,21 @@ dependencies: common: - output_types: conda packages: - - nvcomp==4.0.1 + - nvcomp==4.1.0.6 specific: - output_types: [requirements, pyproject] matrices: - matrix: cuda: "12.*" packages: - - nvidia-nvcomp-cu12==4.0.1 + - nvidia-nvcomp-cu12==4.1.0.6 - matrix: cuda: "11.*" packages: - - nvidia-nvcomp-cu11==4.0.1 + - nvidia-nvcomp-cu11==4.1.0.6 - matrix: packages: - - nvidia-nvcomp==4.0.1 + - nvidia-nvcomp==4.1.0.6 docs: common: - output_types: [conda, requirements] diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 25a961a858..b30437cec0 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "numcodecs !=0.12.0", "numpy>=1.23,<3.0a0", - "nvidia-nvcomp==4.0.1", + "nvidia-nvcomp==4.1.0.6", "packaging", "zarr", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.