From fc56da633a836e4bfbeab92b4f469996e3ffed44 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 31 Oct 2024 04:59:12 -0400 Subject: [PATCH] Use registered strings for NVTX. Add more NVTX annotations. (#518) This PR makes some improvements to NVTX annotations in KvikIO, including: - Improve NVTX range annotation by registering the range name string (which reduces profiling-time overhead) and making the registered string static (which further reduces the overhead). - Separate the responsibility of "scoped range" from "function range" to improve clarity. Previously, the variadic macro `KVIKIO_NVTX_FUNC_RANGE(...)` was used for both the function range and arbitrary scoped range. However, the macro is somewhat of a misnomer for the latter case, where `__func__` is not used in the message. This PR introduces `KVIKIO_NVTX_SCOPED_RANGE(msg, val)` for the latter case. - Add NVTX marker macro `KVIKIO_NVTX_MARKER(message, payload)` to annotate an instantaneous event. One use case is to characterize asynchronous I/O operations (where the scoped range in a function does not apply): Once the profile is collected, the `nsys-rep` file can be exported, for instance, as a `sqlite` file, and the marker data can be used to make an I/O-size histogram. As an example, for `PARQUET_READER_NVBENCH---parquet_read_io_compression`, the following I/O statistics can be obtained. This histogram is generated using the Python script `python/kvikio/examples/kvikio_stat.py`. PR link: TBD... ``` FileHandle::pread() Bins ...... Count [ 0 B, 2 B) ...... 0 [ 2 B, 4 B) ...... 0 [ 4 B, 8 B) ...... 0 [ 8 B, 16 B) ...... 0 [ 16 B, 32 B) ...... 0 [ 32 B, 64 B) ...... 0 [ 64 B, 128 B) ...... 0 [ 128 B, 256 B) ...... 0 [ 256 B, 512 B) ...... 0 [ 512 B, 1024 B) ...... 0 [ 1024 B, 2 KB) ...... 0 [ 2 KB, 4 KB) ...... 0 [ 4 KB, 8 KB) ...... 0 [ 8 KB, 16 KB) ...... 0 [ 16 KB, 32 KB) ...... 0 [ 32 KB, 64 KB) ...... 0 [ 64 KB, 128 KB) ...... 0 [ 128 KB, 256 KB) ...... 45 [ 256 KB, 512 KB) ...... 30 [ 512 KB, 1024 KB) ...... 
15 [ 1024 KB, 2 MB) ...... 120 [ 2 MB, 4 MB) ...... 150 [ 4 MB, 8 MB) ...... 60 [ 8 MB, 16 MB) ...... 0 [ 16 MB, 32 MB) ...... 0 [ 32 MB, 64 MB) ...... 0 [ 64 MB, 128 MB) ...... 0 [ 128 MB, 256 MB] ...... 30 FileHandle::pwrite() Bins ...... Count [ 0 B, 2 B) ...... 0 [ 2 B, 4 B) ...... 0 [ 4 B, 8 B) ...... 0 [ 8 B, 16 B) ...... 0 [ 16 B, 32 B) ...... 0 [ 32 B, 64 B) ...... 0 [ 64 B, 128 B) ...... 0 [ 128 B, 256 B) ...... 0 [ 256 B, 512 B) ...... 0 [ 512 B, 1024 B) ...... 0 [ 1024 B, 2 KB) ...... 0 [ 2 KB, 4 KB) ...... 0 [ 4 KB, 8 KB) ...... 0 [ 8 KB, 16 KB) ...... 0 [ 16 KB, 32 KB) ...... 0 [ 32 KB, 64 KB) ...... 0 [ 64 KB, 128 KB) ...... 0 [ 128 KB, 256 KB) ...... 8 [ 256 KB, 512 KB) ...... 8 [ 512 KB, 1024 KB) ...... 19 [ 1024 KB, 2 MB) ...... 24 [ 2 MB, 4 MB) ...... 4 [ 4 MB, 8 MB) ...... 0 [ 8 MB, 16 MB) ...... 0 [ 16 MB, 32 MB) ...... 0 [ 32 MB, 64 MB) ...... 0 [ 64 MB, 128 MB) ...... 0 [ 128 MB, 256 MB] ...... 2 posix_device_read() Bins ...... Count [ 0 B, 2 B) ...... 0 [ 2 B, 4 B) ...... 0 [ 4 B, 8 B) ...... 0 [ 8 B, 16 B) ...... 0 [ 16 B, 32 B) ...... 0 [ 32 B, 64 B) ...... 0 [ 64 B, 128 B) ...... 0 [ 128 B, 256 B) ...... 0 [ 256 B, 512 B) ...... 0 [ 512 B, 1024 B) ...... 0 [ 1024 B, 2 KB) ...... 0 [ 2 KB, 4 KB) ...... 0 [ 4 KB, 8 KB) ...... 0 [ 8 KB, 16 KB) ...... 0 [ 16 KB, 32 KB) ...... 0 [ 32 KB, 64 KB) ...... 0 [ 64 KB, 128 KB) ...... 0 [ 128 KB, 256 KB) ...... 75 [ 256 KB, 512 KB) ...... 60 [ 512 KB, 1024 KB) ...... 15 [ 1024 KB, 2 MB) ...... 150 [ 2 MB, 4 MB] ...... 1710 posix_device_write() Bins ...... Count [ 0 B, 2 B) ...... 0 [ 2 B, 4 B) ...... 0 [ 4 B, 8 B) ...... 0 [ 8 B, 16 B) ...... 0 [ 16 B, 32 B) ...... 0 [ 32 B, 64 B) ...... 0 [ 64 B, 128 B) ...... 0 [ 128 B, 256 B) ...... 0 [ 256 B, 512 B) ...... 0 [ 512 B, 1024 B) ...... 0 [ 1024 B, 2 KB) ...... 0 [ 2 KB, 4 KB) ...... 0 [ 4 KB, 8 KB) ...... 0 [ 8 KB, 16 KB) ...... 0 [ 16 KB, 32 KB) ...... 0 [ 32 KB, 64 KB) ...... 0 [ 64 KB, 128 KB) ...... 0 [ 128 KB, 256 KB) ...... 
8 [ 256 KB, 512 KB) ...... 8 [ 512 KB, 1024 KB) ...... 19 [ 1024 KB, 2 MB) ...... 24 [ 2 MB, 4 MB] ...... 104 ``` Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/518 --- cpp/include/kvikio/file_handle.hpp | 6 +- cpp/include/kvikio/posix_io.hpp | 8 +-- cpp/include/kvikio/remote_handle.hpp | 8 +-- cpp/include/kvikio/utils.hpp | 99 ++++++++++++++++++++++------ 4 files changed, 90 insertions(+), 31 deletions(-) diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index 19445f1333..7c3e1c92a3 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -335,7 +335,7 @@ class FileHandle { } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } - KVIKIO_NVTX_FUNC_RANGE("cufileRead()", size); + KVIKIO_NVTX_SCOPED_RANGE("cufileRead()", size); ssize_t ret = cuFileAPI::instance().Read( _handle, devPtr_base, size, convert_size2off(file_offset), convert_size2off(devPtr_offset)); CUFILE_CHECK_BYTES_DONE(ret); @@ -387,7 +387,7 @@ class FileHandle { } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } - KVIKIO_NVTX_FUNC_RANGE("cufileWrite()", size); + KVIKIO_NVTX_SCOPED_RANGE("cufileWrite()", size); ssize_t ret = cuFileAPI::instance().Write( _handle, devPtr_base, size, convert_size2off(file_offset), convert_size2off(devPtr_offset)); if (ret == -1) { @@ -434,6 +434,7 @@ class FileHandle { std::size_t gds_threshold = defaults::gds_threshold(), bool sync_default_stream = true) { + KVIKIO_NVTX_MARKER("FileHandle::pread()", size); if (is_host_memory(buf)) { auto op = [this](void* hostPtr_base, std::size_t size, @@ -510,6 +511,7 @@ class FileHandle { std::size_t gds_threshold = defaults::gds_threshold(), bool sync_default_stream = true) { + 
KVIKIO_NVTX_MARKER("FileHandle::pwrite()", size); if (is_host_memory(buf)) { auto op = [this](const void* hostPtr_base, std::size_t size, diff --git a/cpp/include/kvikio/posix_io.hpp b/cpp/include/kvikio/posix_io.hpp index 0437ef69f8..4327a301ec 100644 --- a/cpp/include/kvikio/posix_io.hpp +++ b/cpp/include/kvikio/posix_io.hpp @@ -211,7 +211,7 @@ std::size_t posix_device_io(int fd, template std::size_t posix_host_read(int fd, void* buf, std::size_t size, std::size_t file_offset) { - KVIKIO_NVTX_FUNC_RANGE("posix_host_read()", size); + KVIKIO_NVTX_SCOPED_RANGE("posix_host_read()", size); return detail::posix_host_io( fd, buf, size, convert_size2off(file_offset)); } @@ -233,7 +233,7 @@ std::size_t posix_host_read(int fd, void* buf, std::size_t size, std::size_t fil template std::size_t posix_host_write(int fd, const void* buf, std::size_t size, std::size_t file_offset) { - KVIKIO_NVTX_FUNC_RANGE("posix_host_write()", size); + KVIKIO_NVTX_SCOPED_RANGE("posix_host_write()", size); return detail::posix_host_io( fd, buf, size, convert_size2off(file_offset)); } @@ -257,7 +257,7 @@ inline std::size_t posix_device_read(int fd, std::size_t file_offset, std::size_t devPtr_offset) { - KVIKIO_NVTX_FUNC_RANGE("posix_device_read()", size); + KVIKIO_NVTX_SCOPED_RANGE("posix_device_read()", size); return detail::posix_device_io( fd, devPtr_base, size, file_offset, devPtr_offset); } @@ -281,7 +281,7 @@ inline std::size_t posix_device_write(int fd, std::size_t file_offset, std::size_t devPtr_offset) { - KVIKIO_NVTX_FUNC_RANGE("posix_device_write()", size); + KVIKIO_NVTX_SCOPED_RANGE("posix_device_write()", size); return detail::posix_device_io( fd, devPtr_base, size, file_offset, devPtr_offset); } diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index bff96ce0ad..5bb18f6396 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -164,7 +164,7 @@ inline std::size_t callback_host_memory(char* data, 
ctx->overflow_error = true; return CURL_WRITEFUNC_ERROR; } - KVIKIO_NVTX_FUNC_RANGE("RemoteHandle - callback_host_memory()", nbytes); + KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle - callback_host_memory()", nbytes); std::memcpy(ctx->buf + ctx->offset, data, nbytes); ctx->offset += nbytes; return nbytes; @@ -191,7 +191,7 @@ inline std::size_t callback_device_memory(char* data, ctx->overflow_error = true; return CURL_WRITEFUNC_ERROR; } - KVIKIO_NVTX_FUNC_RANGE("RemoteHandle - callback_device_memory()", nbytes); + KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle - callback_device_memory()", nbytes); ctx->bounce_buffer->write(data, nbytes); ctx->offset += nbytes; @@ -515,7 +515,7 @@ class RemoteHandle { */ std::size_t read(void* buf, std::size_t size, std::size_t file_offset = 0) { - KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::read()", size); + KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle::read()", size); if (file_offset + size > _nbytes) { std::stringstream ss; @@ -578,7 +578,7 @@ class RemoteHandle { std::size_t file_offset = 0, std::size_t task_size = defaults::task_size()) { - KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::pread()", size); + KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle::pread()", size); auto task = [this](void* devPtr_base, std::size_t size, std::size_t file_offset, diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 4c43326063..3cad457ffa 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -287,47 +287,104 @@ struct libkvikio_domain { static constexpr char const* name{"libkvikio"}; }; +// Macro to concatenate two tokens x and y. +#define KVIKIO_CONCAT_HELPER(x, y) x##y +#define KVIKIO_CONCAT(x, y) KVIKIO_CONCAT_HELPER(x, y) + +// Macro to create a static, registered string that will not have a name conflict with any +// registered string defined in the same scope. 
+#define KVIKIO_REGISTER_STRING(msg) \ + [](const char* a_msg) -> auto& { \ + static nvtx3::registered_string_in a_reg_str{a_msg}; \ + return a_reg_str; \ + }(msg) + // Macro overloads of KVIKIO_NVTX_FUNC_RANGE -#define KVIKIO_NVTX_FUNC_RANGE_1() NVTX3_FUNC_RANGE_IN(libkvikio_domain) -#define KVIKIO_NVTX_FUNC_RANGE_2(msg, val) \ - nvtx3::scoped_range_in _kvikio_nvtx_range \ - { \ - nvtx3::event_attributes \ - { \ - msg, nvtx3::payload { convert_to_64bit(val) } \ - } \ +#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(libkvikio_domain) + +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val) \ + nvtx3::scoped_range_in KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ + { \ + nvtx3::event_attributes \ + { \ + KVIKIO_REGISTER_STRING(msg), nvtx3::payload { convert_to_64bit(val) } \ + } \ } -#define GET_KVIKIO_NVTX_FUNC_RANGE_MACRO(_1, _2, NAME, ...) NAME + +#define KVIKIO_NVTX_MARKER_IMPL(msg, val) \ + nvtx3::mark_in( \ + nvtx3::event_attributes{KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(val)}}) + #endif /** * @brief Convenience macro for generating an NVTX range in the `libkvikio` domain * from the lifetime of a function. * - * Takes two arguments (message, payload) or no arguments, in which case the name - * of the immediately enclosing function returned by `__func__` is used. + * Takes no argument. The name of the immediately enclosing function returned by `__func__` is used + * as the message. * * Example: * ``` - * void some_function1(){ - * KVIKIO_NVTX_FUNC_RANGE("my function", 42); - * ... - * } - * void some_function2(){ - * KVIKIO_NVTX_FUNC_RANGE(); // The name `some_function2` is used + * void some_function(){ + * KVIKIO_NVTX_FUNC_RANGE(); // The name `some_function` is used as the message * ... * } * ``` */ #ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_FUNC_RANGE(...) 
\ - GET_KVIKIO_NVTX_FUNC_RANGE_MACRO( \ - __VA_ARGS__, KVIKIO_NVTX_FUNC_RANGE_2, KVIKIO_NVTX_FUNC_RANGE_1) \ - (__VA_ARGS__) +#define KVIKIO_NVTX_FUNC_RANGE() KVIKIO_NVTX_FUNC_RANGE_IMPL() #else #define KVIKIO_NVTX_FUNC_RANGE(...) \ do { \ } while (0) #endif +/** + * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to + * annotate a time duration. + * + * Takes two arguments (message, payload). + * + * Example: + * ``` + * void some_function(){ + * KVIKIO_NVTX_SCOPED_RANGE("my function", 42); + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val) +#else +#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) \ + do { \ + } while (0) +#endif + +/** + * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a + * certain time point. + * + * Takes two arguments (message, payload). Use this macro to annotate asynchronous I/O operations, + * where the payload refers to the I/O size. + * + * Example: + * ``` + * std::future some_function(){ + * size_t io_size{2077}; + * KVIKIO_NVTX_MARKER("I/O operation", io_size); + * perform_async_io_operation(io_size); + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload) +#else +#define KVIKIO_NVTX_MARKER(message, payload) \ + do { \ + } while (0) +#endif + } // namespace kvikio