diff --git a/cpp/include/kvikio/cufile_config.hpp b/cpp/include/kvikio/cufile_config.hpp
index 85d6cf57ed..b1457880bb 100644
--- a/cpp/include/kvikio/cufile_config.hpp
+++ b/cpp/include/kvikio/cufile_config.hpp
@@ -19,6 +19,8 @@
 #include <filesystem>
 #include <string>
 
+#include <kvikio/utils.hpp>
+
 namespace kvikio {
 namespace detail {
 
@@ -39,7 +41,7 @@ namespace detail {
  *
  * @return The filepath to the cufile.json file or the empty string if it isn't found.
  */
-[[nodiscard]] inline const std::string& config_path()
+[[nodiscard]] KVIKIO_EXPORT inline const std::string& config_path()
 {
   static const std::string ret = detail::lookup_config_path();
   return ret;
diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp
index 119197bfc4..c812c6e251 100644
--- a/cpp/include/kvikio/defaults.hpp
+++ b/cpp/include/kvikio/defaults.hpp
@@ -21,7 +21,6 @@
 #include <sstream>
 #include <stdexcept>
 #include <string>
-#include <utility>
 
 #include <BS_thread_pool.hpp>
 
diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp
index fbd510d86f..f84e792489 100644
--- a/cpp/include/kvikio/file_handle.hpp
+++ b/cpp/include/kvikio/file_handle.hpp
@@ -286,7 +286,7 @@ class FileHandle {
    *
    * @return The number of bytes
    */
-  [[nodiscard]] inline std::size_t nbytes() const
+  [[nodiscard]] std::size_t nbytes() const
   {
     if (closed()) { return 0; }
     if (_nbytes == 0) { _nbytes = detail::get_file_size(_fd_direct_off); }
diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 918983a919..7a54b2793b 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -28,6 +28,20 @@
 #include <kvikio/error.hpp>
 #include <kvikio/shim/cuda.hpp>
 
+// Macros used for defining symbol visibility, only GLIBC is supported.
+// Since KvikIO is header-only, we rely on the linker to disambiguate inline functions
+// that have (or return) static references. To do this, the relevant function must have
+// `__attribute__((visibility("default")))`. If not, then if KvikIO is used in two
+// different DSOs, the function will appear twice, and there will be two static objects.
+// See <https://github.com/rapidsai/kvikio/issues/442>.
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MINGW32__) && !defined(__MINGW64__)
+#define KVIKIO_EXPORT __attribute__((visibility("default")))
+#define KVIKIO_HIDDEN __attribute__((visibility("hidden")))
+#else
+#define KVIKIO_EXPORT
+#define KVIKIO_HIDDEN
+#endif
+
 namespace kvikio {
 
 // cuFile defines a page size to 4 KiB
@@ -101,33 +115,6 @@ constexpr bool is_host_memory(const void* ptr) { return true; }
   return ret;
 }
 
-/**
- * @brief RAII wrapper for a CUDA primary context
- */
-class CudaPrimaryContext {
- public:
-  CUdevice dev{};
-  CUcontext ctx{};
-
-  CudaPrimaryContext(int device_ordinal)
-  {
-    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, device_ordinal));
-    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
-  }
-  CudaPrimaryContext(const CudaPrimaryContext&)            = delete;
-  CudaPrimaryContext& operator=(CudaPrimaryContext const&) = delete;
-  CudaPrimaryContext(CudaPrimaryContext&&)                 = delete;
-  CudaPrimaryContext&& operator=(CudaPrimaryContext&&)     = delete;
-  ~CudaPrimaryContext()
-  {
-    try {
-      CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRelease(dev), CUfileException);
-    } catch (const CUfileException& e) {
-      std::cerr << e.what() << std::endl;
-    }
-  }
-};
-
 /**
  * @brief Given a device ordinal, return the primary context of the device.
  *
@@ -136,11 +123,24 @@ class CudaPrimaryContext {
  * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices
  * @return Primary CUDA context
  */
-[[nodiscard]] inline CUcontext get_primary_cuda_context(int ordinal)
+[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal)
 {
-  static std::map<int, CudaPrimaryContext> _primary_contexts;
-  _primary_contexts.try_emplace(ordinal, ordinal);
-  return _primary_contexts.at(ordinal).ctx;
+  static std::map<int, CUcontext> _cache;
+  static std::mutex _mutex;
+  std::lock_guard const lock(_mutex);
+
+  if (_cache.find(ordinal) == _cache.end()) {
+    CUdevice dev{};
+    CUcontext ctx{};
+    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, ordinal));
+
+    // Notice, we let the primary context leak at program exit. We do this because `_cache`
+    // is static and we are not allowed to call `cuDevicePrimaryCtxRelease()` after main:
+    // <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
+    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
+    _cache.emplace(ordinal, ctx);
+  }
+  return _cache.at(ordinal);
 }
 
 /**