Add NVRTC Runner #8

Merged · 29 commits · Aug 7, 2024
Commits
b0a1a51  Add beta NVRTC test (mborland, Aug 5, 2024)
70b71d1  Add additional NVRTC testing to CML (mborland, Aug 5, 2024)
583583c  Add NVRTC runner (mborland, Aug 5, 2024)
aea9542  Simplify CML (mborland, Aug 5, 2024)
9f8ff0d  Update install sub-packages (mborland, Aug 5, 2024)
6340f88  Rename file (mborland, Aug 5, 2024)
c3136f7  Change to test jamfile call (mborland, Aug 5, 2024)
5415e4a  Manually set nvrtc library (mborland, Aug 5, 2024)
0d97160  Pass in a header (mborland, Aug 5, 2024)
b117a02  Add compile options (mborland, Aug 5, 2024)
40091e5  Add more options (mborland, Aug 5, 2024)
827bf37  Reduce to proof of concept (mborland, Aug 5, 2024)
54c35fc  Rename file (mborland, Aug 5, 2024)
259d167  Make tgamma NVRTC capable (mborland, Aug 5, 2024)
530356d  Try calling boost math tgamma from kernel (mborland, Aug 5, 2024)
d7bb4f5  Remove runtime headers (mborland, Aug 6, 2024)
ae3e865  Add additional config macro for paths (mborland, Aug 6, 2024)
c95d1bb  Change test set (mborland, Aug 6, 2024)
ecc05c5  Change from relative to absolute path (mborland, Aug 6, 2024)
13d4129  Pass definition to compilation steps (mborland, Aug 6, 2024)
6dd3ed7  Add try catch for the entire test set (mborland, Aug 6, 2024)
7312429  Reduce test size (mborland, Aug 6, 2024)
bde1f70  Reorganize cleanup (mborland, Aug 6, 2024)
51cd063  Add additional checks (mborland, Aug 6, 2024)
faba323  Add additional CI program options (mborland, Aug 6, 2024)
2f7869e  Install everything (mborland, Aug 6, 2024)
5b18b5d  Fix line endings (mborland, Aug 6, 2024)
3af877a  Rearrange and add policy overload (mborland, Aug 7, 2024)
3f0cc1e  Expand testing (mborland, Aug 7, 2024)
58 changes: 58 additions & 0 deletions .github/workflows/cuda.yml
@@ -76,6 +76,64 @@ jobs:
          cd ../boost-root/__build__
          ctest --output-on-failure --no-tests=error

  nvrtc-cmake-test:
    strategy:
      fail-fast: false

    runs-on: gpu-runner-1

    steps:
      - uses: Jimver/cuda-toolkit@v0.2.16
        id: cuda-toolkit
        with:
          cuda: '12.5.0'
          method: 'network'

      - name: Output CUDA information
        run: |
          echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"
          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
          nvcc -V
      - uses: actions/checkout@v4

      - name: Install Packages
        run: |
          sudo apt-get install -y cmake make
      - name: Setup Boost
        run: |
          echo GITHUB_REPOSITORY: $GITHUB_REPOSITORY
          LIBRARY=${GITHUB_REPOSITORY#*/}
          echo LIBRARY: $LIBRARY
          echo "LIBRARY=$LIBRARY" >> $GITHUB_ENV
          echo GITHUB_BASE_REF: $GITHUB_BASE_REF
          echo GITHUB_REF: $GITHUB_REF
          REF=${GITHUB_BASE_REF:-$GITHUB_REF}
          REF=${REF#refs/heads/}
          echo REF: $REF
          BOOST_BRANCH=develop && [ "$REF" == "master" ] && BOOST_BRANCH=master || true
          echo BOOST_BRANCH: $BOOST_BRANCH
          cd ..
          git clone -b $BOOST_BRANCH --depth 1 https://github.com/boostorg/boost.git boost-root
          cd boost-root
          mkdir -p libs/$LIBRARY
          cp -r $GITHUB_WORKSPACE/* libs/$LIBRARY
          git submodule update --init tools/boostdep
          python3 tools/boostdep/depinst/depinst.py --git_args "--jobs 3" $LIBRARY
      - name: Configure
        run: |
          cd ../boost-root
          mkdir __build__ && cd __build__
          cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_MATH_ENABLE_NVRTC=1 -DCMAKE_CUDA_ARCHITECTURES=70 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.5 -DBOOST_MATH_NVRTC_CI_RUN=1 ..
          pwd
      - name: Build tests
        run: |
          cd ../boost-root/__build__
          cmake --build . --target tests -j $(nproc)
      - name: Run tests
        run: |
          cd ../boost-root/__build__
          ctest --output-on-failure --no-tests=error

  sycl-cmake-test:
    strategy:
      fail-fast: false
21 changes: 21 additions & 0 deletions include/boost/math/special_functions/gamma.hpp
@@ -14,6 +14,8 @@
#pragma once
#endif

#ifndef __CUDACC_RTC__

#include <boost/math/tools/config.hpp>
#include <boost/math/tools/series.hpp>
#include <boost/math/tools/fraction.hpp>
@@ -2280,4 +2282,23 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
#include <boost/math/special_functions/detail/gamma_inva.hpp>
#include <boost/math/special_functions/erf.hpp>

#else

namespace boost {
namespace math {

inline __host__ __device__ float tgamma(float x) { return ::tgammaf(x); }
inline __host__ __device__ double tgamma(double x) { return ::tgamma(x); }

template <typename T, typename Policy>
inline __host__ __device__ T tgamma(T x, const Policy&)
{
    return boost::math::tgamma(x);
}

} // namespace math
} // namespace boost

#endif // __CUDACC_RTC__

#endif // BOOST_MATH_SF_GAMMA_HPP
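
For context, the #else branch above is the path an NVRTC-compiled translation unit takes: with __CUDACC_RTC__ defined, boost::math::tgamma maps to the CUDA built-ins ::tgammaf and ::tgamma, and the new policy overload simply forwards to the plain one. A minimal kernel source string that exercises this path could look like the sketch below (the names here are illustrative and not part of the PR; the include path must still be passed to nvrtcCompileProgram, as the tests added by this PR do):

// Sketch only: kernel source handed to NVRTC, assuming --include-path points
// at the boost.math headers, as configured in the tests added by this PR.
const char* example_kernel = R"(
#include <boost/math/special_functions/gamma.hpp>

extern "C" __global__
void example_tgamma_kernel(const double* in, double* out, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        // Under __CUDACC_RTC__ this resolves to the ::tgamma fallback above;
        // the policy overload added in this hunk forwards to the same call.
        out[i] = boost::math::tgamma(in[i]);
    }
}
)";

This mirrors the kernel string embedded in test_gamma_nvrtc_double.cpp further down in this diff.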
17 changes: 17 additions & 0 deletions test/CMakeLists.txt
@@ -16,6 +16,23 @@ if(HAVE_BOOST_TEST)
enable_testing()

boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::cuda_math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception Boost::unit_test_framework )

elseif (BOOST_MATH_ENABLE_NVRTC)

message(STATUS "Building boost.cuda_math with NVRTC")

find_package(CUDA REQUIRED)

enable_testing()

set(CUDA_nvrtc_LIBRARY /usr/local/cuda/lib64/libnvrtc.so)

if (BOOST_MATH_NVRTC_CI_RUN)
boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::cuda_math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} COMPILE_DEFINITIONS BOOST_MATH_NVRTC_CI_RUN=1 INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
else ()
boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::cuda_math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
endif()

elseif (BOOST_MATH_ENABLE_SYCL)

message(STATUS "Building boost.cuda_math with SYCL")
14 changes: 14 additions & 0 deletions test/nvrtc_jamfile
@@ -0,0 +1,14 @@
# Copyright 2024 Matt Borland
# Distributed under the Boost Software License, Version 1.0.
# https://www.boost.org/LICENSE_1_0.txt

import testing ;
import ../../config/checks/config : requires ;

project : requirements
[ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
;

# Special Functions
run test_gamma_nvrtc_double.cpp ;
run test_gamma_nvrtc_float.cpp ;
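
The jamfile registers a float variant alongside the double test, but only test_gamma_nvrtc_double.cpp appears in this diff. The float file is not shown, so the following is an assumption: it presumably mirrors the double test with the element type switched to float on both the host side and inside the embedded kernel source, roughly:

// Assumed shape of test_gamma_nvrtc_float.cpp (file not shown in this diff):
// identical to test_gamma_nvrtc_double.cpp below except for the element type.
typedef float float_type;

const char* cuda_kernel = R"(
typedef float float_type;
#include <boost/math/special_functions/gamma.hpp>
extern "C" __global__
void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        out[i] = boost::math::tgamma(in1[i]);
    }
}
)";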
186 changes: 186 additions & 0 deletions test/test_gamma_nvrtc_double.cpp
@@ -0,0 +1,186 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false

#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>

typedef double float_type;

const char* cuda_kernel = R"(
typedef double float_type;
#include <boost/math/special_functions/gamma.hpp>
extern "C" __global__
void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        out[i] = boost::math::tgamma(in1[i]);
    }
}
)";

void checkCUDAError(cudaError_t result, const char* msg)
{
    if (result != cudaSuccess)
    {
        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
        exit(EXIT_FAILURE);
    }
}

void checkCUError(CUresult result, const char* msg)
{
    if (result != CUDA_SUCCESS)
    {
        const char* errorStr;
        cuGetErrorString(result, &errorStr);
        std::cerr << msg << ": " << errorStr << std::endl;
        exit(EXIT_FAILURE);
    }
}

void checkNVRTCError(nvrtcResult result, const char* msg)
{
    if (result != NVRTC_SUCCESS)
    {
        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
        exit(EXIT_FAILURE);
    }
}

int main()
{
    try
    {
        // Initialize CUDA driver API
        checkCUError(cuInit(0), "Failed to initialize CUDA");

        // Create CUDA context
        CUcontext context;
        CUdevice device;
        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");

        nvrtcProgram prog;
        nvrtcResult res;

        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
        checkNVRTCError(res, "Failed to create NVRTC program");

        nvrtcAddNameExpression(prog, "test_gamma_kernel");

        #ifdef BOOST_MATH_NVRTC_CI_RUN
        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/"};
        #else
        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"};
        #endif

        // Compile the program
        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
        if (res != NVRTC_SUCCESS)
        {
            size_t log_size;
            nvrtcGetProgramLogSize(prog, &log_size);
            char* log = new char[log_size];
            nvrtcGetProgramLog(prog, log);
            std::cerr << "Compilation failed:\n" << log << std::endl;
            delete[] log;
            exit(EXIT_FAILURE);
        }

        // Get PTX from the program
        size_t ptx_size;
        nvrtcGetPTXSize(prog, &ptx_size);
        char* ptx = new char[ptx_size];
        nvrtcGetPTX(prog, ptx);

        // Load PTX into CUDA module
        CUmodule module;
        CUfunction kernel;
        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");

        int numElements = 5000;
        float_type *h_in1, *h_in2, *h_out;
        float_type *d_in1, *d_in2, *d_out;

        // Allocate memory on the host
        h_in1 = new float_type[numElements];
        h_in2 = new float_type[numElements];
        h_out = new float_type[numElements];

        // Initialize input arrays
        std::mt19937_64 rng(42);
        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
        for (int i = 0; i < numElements; ++i)
        {
            h_in1[i] = static_cast<float_type>(dist(rng));
            h_in2[i] = static_cast<float_type>(dist(rng));
        }

        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");

        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");

        int blockSize = 256;
        int numBlocks = (numElements + blockSize - 1) / blockSize;
        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");

        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");

        // Verify Result
        for (int i = 0; i < numElements; ++i)
        {
            auto res = boost::math::tgamma(h_in1[i]);
            if (std::isfinite(res))
            {
                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
                {
                    std::cout << "error at line: " << i
                              << "\nParallel: " << h_out[i]
                              << "\n  Serial: " << res
                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
                }
            }
        }

        cudaFree(d_in1);
        cudaFree(d_in2);
        cudaFree(d_out);
        delete[] h_in1;
        delete[] h_in2;
        delete[] h_out;

        nvrtcDestroyProgram(&prog);
        delete[] ptx;

        cuCtxDestroy(context);

        std::cout << "Kernel executed successfully." << std::endl;
        return 0;
    }
    catch(const std::exception& e)
    {
        std::cerr << "Stopped with exception: " << e.what() << std::endl;
        return EXIT_FAILURE;
    }
}