Skip to content

Commit

Permalink
Merge branch 'branch-24.12' into improve_nvtx
Browse files Browse the repository at this point in the history
  • Loading branch information
madsbk authored Oct 31, 2024
2 parents 4d01c21 + b65eda3 commit 48785d3
Show file tree
Hide file tree
Showing 15 changed files with 166 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ repos:
meta[.]yaml$
- id: verify-alpha-spec
- repo: https://github.com/rapidsai/dependency-file-generator
rev: v1.13.11
rev: v1.16.0
hooks:
- id: rapids-dependency-file-generator
args: ["--clean"]
Expand Down
4 changes: 4 additions & 0 deletions ci/build_cpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ rapids-print-env
rapids-logger "Begin cpp build"
conda config --set path_conflict prevent

sccache --zero-stats

rapids-conda-retry mambabuild conda/recipes/libkvikio

sccache --show-adv-stats

rapids-upload-conda-to-s3 cpp
4 changes: 4 additions & 0 deletions ci/build_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ rapids-logger "Begin py build"
CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
conda config --set path_conflict prevent

sccache --zero-stats

rapids-conda-retry mambabuild \
--channel "${CPP_CHANNEL}" \
conda/recipes/kvikio

sccache --show-adv-stats

rapids-upload-conda-to-s3 python
6 changes: 5 additions & 1 deletion ci/build_wheel_cpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@ rapids-generate-version > ./VERSION

cd "${package_dir}"

sccache --zero-stats

python -m pip install wheel
python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check

sccache --show-adv-stats

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist
8 changes: 6 additions & 2 deletions ci/build_wheel_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,18 @@ cd "${package_dir}"
# are used when creating the isolated build environment
echo "libkvikio-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${CPP_WHEELHOUSE}/libkvikio_*.whl)" > ./constraints.txt

sccache --zero-stats

PIP_CONSTRAINT="${PWD}/constraints.txt" \
SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" \
python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check

sccache --show-adv-stats

mkdir -p final_dist
python -m auditwheel repair \
--exclude libnvcomp.so.4 \
-w final_dist \
dist/*

RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies:
- numpy>=1.23,<3.0a0
- numpydoc
- nvcc_linux-aarch64=11.8
- nvcomp==4.0.1
- nvcomp==4.1.0.6
- packaging
- pre-commit
- pytest
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ dependencies:
- numpy>=1.23,<3.0a0
- numpydoc
- nvcc_linux-64=11.8
- nvcomp==4.0.1
- nvcomp==4.1.0.6
- packaging
- pre-commit
- pytest
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies:
- numcodecs !=0.12.0
- numpy>=1.23,<3.0a0
- numpydoc
- nvcomp==4.0.1
- nvcomp==4.1.0.6
- packaging
- pre-commit
- pytest
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies:
- numcodecs !=0.12.0
- numpy>=1.23,<3.0a0
- numpydoc
- nvcomp==4.0.1
- nvcomp==4.1.0.6
- packaging
- pre-commit
- pytest
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/kvikio/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ cmake_version:
- ">=3.26.4,!=3.30.0"

nvcomp_version:
- "=4.0.1"
- "=4.1.0.6"
3 changes: 3 additions & 0 deletions cpp/include/kvikio/bounce_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class AllocRetain {
// The size of each allocation in `_free_allocs`
std::size_t _size{defaults::bounce_buffer_size()};

public:
/**
* @brief A host memory allocation
*/
Expand All @@ -56,6 +57,7 @@ class AllocRetain {
Alloc& operator=(Alloc&& o) = delete;
~Alloc() noexcept { _manager->put(_alloc, _size); }
void* get() noexcept { return _alloc; }
void* get(std::ptrdiff_t offset) noexcept { return static_cast<char*>(_alloc) + offset; }
std::size_t size() noexcept { return _size; }
};

Expand All @@ -67,6 +69,7 @@ class AllocRetain {
// <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
~AllocRetain() noexcept = default;

private:
/**
* @brief Free all retained allocations
*
Expand Down
110 changes: 104 additions & 6 deletions cpp/include/kvikio/remote_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
*/
#pragma once

#include <cassert>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <memory>
#include <optional>
#include <regex>
Expand All @@ -34,6 +36,98 @@
namespace kvikio {
namespace detail {

/**
 * @brief Staging ("bounce") buffer in pinned host memory for data headed to device memory.
 *
 * Host data passed to `write()` is accumulated in a host allocation and copied to the output
 * device buffer when the allocation cannot hold the next chunk, and once more on destruction.
 *
 * @note Is not thread-safe.
 */
class BounceBufferH2D {
  CUstream _stream;                 // The CUDA stream used for every host-to-device copy.
  CUdeviceptr _dev;                 // The output device buffer.
  AllocRetain::Alloc _host_buffer;  // The host allocation in which data is staged.
  std::ptrdiff_t _dev_offset{0};    // Number of bytes already written to `_dev`.
  std::ptrdiff_t _host_offset{0};   // Number of bytes staged in `_host_buffer` (resets on flush).

 public:
  /**
   * @brief Create a bounce buffer in front of an output device buffer.
   *
   * @param stream The CUDA stream used throughout the lifetime of the bounce buffer.
   * @param device_buffer The output device buffer (final destination of the data).
   */
  BounceBufferH2D(CUstream stream, void* device_buffer)
    : _stream{stream},
      _dev{convert_void2deviceptr(device_buffer)},
      _host_buffer{AllocRetain::instance().get()}
  {
  }

  /**
   * @brief Flush whatever is still staged to the device.
   *
   * A `CUfileException` raised by the final flush is reported on `std::cerr` instead of being
   * propagated out of the destructor.
   */
  ~BounceBufferH2D() noexcept
  {
    try {
      flush();
    } catch (CUfileException const& e) {
      std::cerr << "BounceBufferH2D error on final flush: " << e.what() << std::endl;
    }
  }

  /**
   * @brief Append host memory to the bounce buffer (also host memory).
   *
   * Data is only copied to the output device buffer when the bounce buffer has no room left
   * for the incoming chunk.
   *
   * @param data The host memory source.
   * @param size Number of bytes to write.
   */
  void write(char const* data, std::size_t size)
  {
    std::size_t const capacity = _host_buffer.size();

    // Make room first: the staged bytes must reach the device before the new chunk does.
    if (capacity - _host_offset < size) {
      flush();
      assert(_host_offset == 0);
    }

    // A chunk larger than the whole bounce buffer bypasses staging and goes straight to the
    // device. This only happens when `defaults::bounce_buffer_size()` is smaller than 16kb,
    // so this path is not optimized for performance.
    if (size > capacity) {
      write_to_device(data, size);
      return;
    }

    if (size == 0) { return; }
    std::memcpy(_host_buffer.get(_host_offset), data, size);
    _host_offset += size;
  }

 private:
  /**
   * @brief Copy host memory to the next free position of the output device buffer.
   *
   * The stream is synchronized before returning, and `_dev_offset` is advanced by `size`.
   *
   * @param src The host memory source.
   * @param size Number of bytes to write; nothing happens when it is zero.
   */
  void write_to_device(void const* src, std::size_t size)
  {
    if (size == 0) { return; }
    CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(_dev + _dev_offset, src, size, _stream));
    CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream));
    _dev_offset += size;
  }

  /**
   * @brief Write everything currently staged to the output device buffer and reset the
   * staging offset.
   */
  void flush()
  {
    write_to_device(_host_buffer.get(), _host_offset);
    _host_offset = 0;
  }
};

/**
* @brief Context used by the "CURLOPT_WRITEFUNCTION" callbacks.
*/
Expand All @@ -46,6 +140,7 @@ struct CallbackContext {
: buf{static_cast<char*>(buf)}, size{size}, offset{0}, overflow_error{0}
{
}
BounceBufferH2D* bounce_buffer{nullptr}; // Only used by callback_device_memory
};

/**
Expand Down Expand Up @@ -98,12 +193,7 @@ inline std::size_t callback_device_memory(char* data,
}
KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle - callback_device_memory()", nbytes);

CUstream stream = detail::StreamsByThread::get();
CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(
convert_void2deviceptr(ctx->buf + ctx->offset), data, nbytes, stream));
// We have to sync since curl might overwrite or free `data`.
CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream));

ctx->bounce_buffer->write(data, nbytes);
ctx->offset += nbytes;
return nbytes;
}
Expand Down Expand Up @@ -414,6 +504,10 @@ class RemoteHandle {
/**
* @brief Read from remote source into buffer (host or device memory).
*
* When reading into device memory, a bounce buffer is used to avoid many small memory
* copies to device. Use `kvikio::defaults::bounce_buffer_size_reset()` to set the size
* of this bounce buffer (default 16 MiB).
*
* @param buf Pointer to host or device memory.
* @param size Number of bytes to read.
* @param file_offset File offset in bytes.
Expand Down Expand Up @@ -450,6 +544,10 @@ class RemoteHandle {
curl.perform();
} else {
PushAndPopContext c(get_context_from_pointer(buf));
// We use a bounce buffer to avoid many small memory copies to device. Libcurl has a
// maximum chunk size of 16kb (`CURL_MAX_WRITE_SIZE`) but chunks are often much smaller.
detail::BounceBufferH2D bounce_buffer(detail::StreamsByThread::get(), buf);
ctx.bounce_buffer = &bounce_buffer;
curl.perform();
}
} catch (std::runtime_error const& e) {
Expand Down
33 changes: 29 additions & 4 deletions cpp/include/kvikio/shim/libcurl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ class CurlHandle {
/**
* @brief Construct a new curl handle.
*
* Typically, do not use this directly instead use the `create_curl_handle()` macro.
* Typically, do not call this directly; instead, use the `create_curl_handle()` macro.
*
* @param handle An unused curl easy handle pointer, which is retained on destruction.
* @param source_file Path of source file of the caller (for error messages).
Expand All @@ -166,6 +166,7 @@ class CurlHandle {
setopt(CURLOPT_NOSIGNAL, 1L);

// We always set CURLOPT_ERRORBUFFER to get better error messages.
_errbuf[0] = 0; // Set the error buffer as empty.
setopt(CURLOPT_ERRORBUFFER, _errbuf);

// Make curl_easy_perform() fail when receiving HTTP code errors.
Expand Down Expand Up @@ -216,7 +217,7 @@ class CurlHandle {
// Perform the curl operation and check for errors.
CURLcode err = curl_easy_perform(handle());
if (err != CURLE_OK) {
std::string msg(_errbuf);
std::string msg(_errbuf); // We can do this because we always initialize `_errbuf` as empty.
std::stringstream ss;
ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line;
if (msg.empty()) {
Expand Down Expand Up @@ -249,12 +250,36 @@ class CurlHandle {
}
};

namespace detail {
/**
 * @brief Fix Conda's manipulation of __FILE__.
 *
 * Conda manipulates the path information in its shared libraries[1], with the result that the
 * C macro `__FILE__` might contain trailing `\0` chars. Normally, this isn't a problem because
 * `__FILE__` is a `const char*` that is terminated by the first `\0` encountered. However, when
 * creating a `std::string` from a `char*`, the compiler might optimize the code such that the
 * `std::string` is created from the full size of `__FILE__`, including the trailing `\0` chars.
 *
 * The extra `\0` is problematic if `CurlHandle` later throws an exception to Cython since, while
 * converting the exception to Python, Cython might truncate the error message.
 *
 * [1] <https://docs.conda.io/projects/conda-build/en/latest/resources/make-relocatable.html>
 *
 * @param filename The file path, possibly padded with trailing `\0` chars.
 * @return A copy of `filename` that ends just before its first `\0` char.
 */
__attribute__((noinline)) inline std::string fix_conda_file_path_hack(std::string filename)
{
  // `std::string::c_str()` never returns a null pointer, so no null check is needed.
  // Constructing from the C string stops at the first `\0`, which drops any padding.
  return std::string{filename.c_str()};
}
} // namespace detail

/**
* @brief Create a new curl handle.
*
* @returns A `kvikio::CurlHandle` instance ready to be used.
*/
#define create_curl_handle() \
kvikio::CurlHandle(kvikio::LibCurl::instance().get_handle(), __FILE__, KVIKIO_STRINGIFY(__LINE__))
#define create_curl_handle() \
kvikio::CurlHandle(kvikio::LibCurl::instance().get_handle(), \
kvikio::detail::fix_conda_file_path_hack(__FILE__), \
KVIKIO_STRINGIFY(__LINE__))

} // namespace kvikio
8 changes: 4 additions & 4 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -267,21 +267,21 @@ dependencies:
common:
- output_types: conda
packages:
- nvcomp==4.0.1
- nvcomp==4.1.0.6
specific:
- output_types: [requirements, pyproject]
matrices:
- matrix:
cuda: "12.*"
packages:
- nvidia-nvcomp-cu12==4.0.1
- nvidia-nvcomp-cu12==4.1.0.6
- matrix:
cuda: "11.*"
packages:
- nvidia-nvcomp-cu11==4.0.1
- nvidia-nvcomp-cu11==4.1.0.6
- matrix:
packages:
- nvidia-nvcomp==4.0.1
- nvidia-nvcomp==4.1.0.6
docs:
common:
- output_types: [conda, requirements]
Expand Down
2 changes: 1 addition & 1 deletion python/kvikio/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ dependencies = [
"cupy-cuda11x>=12.0.0",
"numcodecs !=0.12.0",
"numpy>=1.23,<3.0a0",
"nvidia-nvcomp==4.0.1",
"nvidia-nvcomp==4.1.0.6",
"packaging",
"zarr",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
Expand Down

0 comments on commit 48785d3

Please sign in to comment.