Skip to content

Commit

Permalink
Merge pull request #8 from cppalliance/NVRTC
Browse files Browse the repository at this point in the history
Add NVRTC Runner
  • Loading branch information
mborland authored Aug 7, 2024
2 parents 8fd701b + 3f0cc1e commit 253ad83
Show file tree
Hide file tree
Showing 6 changed files with 482 additions and 0 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,64 @@ jobs:
cd ../boost-root/__build__
ctest --output-on-failure --no-tests=error
nvrtc-cmake-test:
strategy:
fail-fast: false

runs-on: gpu-runner-1

steps:
- uses: Jimver/cuda-toolkit@v0.2.16
id: cuda-toolkit
with:
cuda: '12.5.0'
method: 'network'

- name: Output CUDA information
run: |
echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
nvcc -V
- uses: actions/checkout@v4

- name: Install Packages
run: |
sudo apt-get install -y cmake make
- name: Setup Boost
run: |
echo GITHUB_REPOSITORY: $GITHUB_REPOSITORY
LIBRARY=${GITHUB_REPOSITORY#*/}
echo LIBRARY: $LIBRARY
echo "LIBRARY=$LIBRARY" >> $GITHUB_ENV
echo GITHUB_BASE_REF: $GITHUB_BASE_REF
echo GITHUB_REF: $GITHUB_REF
REF=${GITHUB_BASE_REF:-$GITHUB_REF}
REF=${REF#refs/heads/}
echo REF: $REF
BOOST_BRANCH=develop && [ "$REF" == "master" ] && BOOST_BRANCH=master || true
echo BOOST_BRANCH: $BOOST_BRANCH
cd ..
git clone -b $BOOST_BRANCH --depth 1 https://github.com/boostorg/boost.git boost-root
cd boost-root
mkdir -p libs/$LIBRARY
cp -r $GITHUB_WORKSPACE/* libs/$LIBRARY
git submodule update --init tools/boostdep
python3 tools/boostdep/depinst/depinst.py --git_args "--jobs 3" $LIBRARY
- name: Configure
run: |
cd ../boost-root
mkdir __build__ && cd __build__
cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_MATH_ENABLE_NVRTC=1 -DCMAKE_CUDA_ARCHITECTURES=70 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.5 -DBOOST_MATH_NVRTC_CI_RUN=1 ..
pwd
- name: Build tests
run: |
cd ../boost-root/__build__
cmake --build . --target tests -j $(nproc)
- name: Run tests
run: |
cd ../boost-root/__build__
ctest --output-on-failure --no-tests=error
sycl-cmake-test:
strategy:
fail-fast: false
Expand Down
21 changes: 21 additions & 0 deletions include/boost/math/special_functions/gamma.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#pragma once
#endif

#ifndef __CUDACC_RTC__

#include <boost/math/tools/config.hpp>
#include <boost/math/tools/series.hpp>
#include <boost/math/tools/fraction.hpp>
Expand Down Expand Up @@ -2280,4 +2282,23 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
#include <boost/math/special_functions/detail/gamma_inva.hpp>
#include <boost/math/special_functions/erf.hpp>

#else

namespace boost {
namespace math {

// Minimal NVRTC-side shims for boost::math::tgamma: under __CUDACC_RTC__ the
// full header machinery above is unavailable, so forward to the CUDA device
// math library instead (::tgammaf / ::tgamma).
inline __host__ __device__ float tgamma(float x) { return ::tgammaf(x); }
inline __host__ __device__ double tgamma(double x) { return ::tgamma(x); }

// Policy-taking overload kept for signature compatibility with the full
// implementation; the policy argument is deliberately ignored here, so any
// caller-specified error-handling policy has no effect on the NVRTC path.
template <typename T, typename Policy>
inline __host__ __device__ T tgamma(T x, const Policy&)
{
return boost::math::tgamma(x);
}

} // namespace math
} // namespace boost

#endif // __CUDACC_RTC__

#endif // BOOST_MATH_SF_GAMMA_HPP
17 changes: 17 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@ if(HAVE_BOOST_TEST)
enable_testing()

boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::cuda_math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception Boost::unit_test_framework )

elseif (BOOST_MATH_ENABLE_NVRTC)

message(STATUS "Building boost.cuda_math with NVRTC")

find_package(CUDA REQUIRED)

enable_testing()

set(CUDA_nvrtc_LIBRARY /usr/local/cuda/lib64/libnvrtc.so)

if (BOOST_MATH_NVRTC_CI_RUN)
boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::cuda_math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} COMPILE_DEFINITIONS BOOST_MATH_NVRTC_CI_RUN=1 INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
else ()
boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::cuda_math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
endif()

elseif (BOOST_MATH_ENABLE_SYCL)

message(STATUS "Building boost.cuda_math with SYCL")
Expand Down
14 changes: 14 additions & 0 deletions test/nvrtc_jamfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright 2024 Matt Borland
# Distributed under the Boost Software License, Version 1.0.
# https://www.boost.org/LICENSE_1_0.txt

import testing ;
import ../../config/checks/config : requires ;

project : requirements
[ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
;

# Special Functions
run test_gamma_nvrtc_double.cpp ;
run test_gamma_nvrtc_float.cpp ;
186 changes: 186 additions & 0 deletions test/test_gamma_nvrtc_double.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false

#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>

// Element type under test; this is the double-precision variant of the test
// (the float variant presumably differs only in this typedef — see
// test_gamma_nvrtc_float.cpp).
typedef double float_type;

// Kernel source compiled at runtime by NVRTC. The unnamed second input
// pointer is unused by this kernel but kept so all NVRTC gamma tests share a
// single launch signature (in1, in2, out, numElements).
const char* cuda_kernel = R"(
typedef double float_type;
#include <boost/math/special_functions/gamma.hpp>
extern "C" __global__
void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        out[i] = boost::math::tgamma(in1[i]);
    }
}
)";

// Terminate the process with a diagnostic if a CUDA runtime API call failed.
// msg: context string prepended to the runtime's error description.
void checkCUDAError(cudaError_t result, const char* msg)
{
    if (result == cudaSuccess)
    {
        return;
    }

    std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
    exit(EXIT_FAILURE);
}

// Terminate the process with a diagnostic if a CUDA driver API call failed.
// msg: context string prepended to the driver's error description.
void checkCUError(CUresult result, const char* msg)
{
    if (result != CUDA_SUCCESS)
    {
        // cuGetErrorString can itself fail (e.g. for an unrecognised error
        // code), in which case it does not write through the output pointer.
        // The original streamed an uninitialized pointer in that case (UB);
        // initialise it and fall back to a fixed message instead.
        const char* errorStr = nullptr;
        if (cuGetErrorString(result, &errorStr) != CUDA_SUCCESS || errorStr == nullptr)
        {
            errorStr = "unrecognized CUDA driver error";
        }
        std::cerr << msg << ": " << errorStr << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Terminate the process with a diagnostic if an NVRTC API call failed.
// msg: context string prepended to NVRTC's error description.
void checkNVRTCError(nvrtcResult result, const char* msg)
{
    if (result == NVRTC_SUCCESS)
    {
        return;
    }

    std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
    exit(EXIT_FAILURE);
}

// NVRTC smoke test for boost::math::tgamma (double precision):
// compiles `cuda_kernel` at runtime, launches it over 5000 random inputs in
// (0, 1), and compares device results against the host implementation with a
// 300-ulp tolerance. Returns EXIT_FAILURE on any CUDA/NVRTC error or any
// tolerance violation — the original always returned 0, so CI could never
// actually detect a numerical regression.
int main()
{
    try
    {
        // Initialize CUDA driver API
        checkCUError(cuInit(0), "Failed to initialize CUDA");

        // Create CUDA context
        CUcontext context;
        CUdevice device;
        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");

        nvrtcProgram prog;
        nvrtcResult res;

        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
        checkNVRTCError(res, "Failed to create NVRTC program");

        // Previously unchecked: a failure here would only surface later as a
        // confusing compile or symbol-lookup error.
        checkNVRTCError(nvrtcAddNameExpression(prog, "test_gamma_kernel"),
                        "Failed to add name expression");

        #ifdef BOOST_MATH_NVRTC_CI_RUN
        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/"};
        #else
        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"};
        #endif

        // Compile the program; on failure dump the full NVRTC log.
        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
        if (res != NVRTC_SUCCESS)
        {
            size_t log_size;
            nvrtcGetProgramLogSize(prog, &log_size);
            char* log = new char[log_size];
            nvrtcGetProgramLog(prog, log);
            std::cerr << "Compilation failed:\n" << log << std::endl;
            delete[] log;
            exit(EXIT_FAILURE);
        }

        // Get PTX from the program (both calls were previously unchecked)
        size_t ptx_size;
        checkNVRTCError(nvrtcGetPTXSize(prog, &ptx_size), "Failed to get PTX size");
        char* ptx = new char[ptx_size];
        checkNVRTCError(nvrtcGetPTX(prog, ptx), "Failed to get PTX");

        // Load PTX into CUDA module
        CUmodule module;
        CUfunction kernel;
        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");

        int numElements = 5000;
        float_type *h_in1, *h_in2, *h_out;
        float_type *d_in1, *d_in2, *d_out;

        // Allocate memory on the host
        h_in1 = new float_type[numElements];
        h_in2 = new float_type[numElements];
        h_out = new float_type[numElements];

        // Initialize input arrays; fixed seed keeps the run reproducible.
        std::mt19937_64 rng(42);
        std::uniform_real_distribution<float_type> dist(0.0, 1.0); // double literals for a double distribution
        for (int i = 0; i < numElements; ++i)
        {
            h_in1[i] = dist(rng);
            h_in2[i] = dist(rng);
        }

        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");

        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");

        int blockSize = 256;
        int numBlocks = (numElements + blockSize - 1) / blockSize; // ceil-div so the grid covers the tail
        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");

        // Blocking D2H copy also synchronizes with the kernel before h_out is read.
        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");

        // Verify result against the host implementation, counting failures so
        // the process exit code reflects them.
        int fail_count = 0;
        for (int i = 0; i < numElements; ++i)
        {
            // `expected` renamed from `res` to stop shadowing nvrtcResult res above
            const auto expected = boost::math::tgamma(h_in1[i]);
            if (std::isfinite(expected))
            {
                if (boost::math::epsilon_difference(expected, h_out[i]) > 300)
                {
                    ++fail_count;
                    std::cout << "error at line: " << i
                              << "\nParallel: " << h_out[i]
                              << "\n  Serial: " << expected
                              << "\n    Dist: " << boost::math::epsilon_difference(expected, h_out[i]) << std::endl;
                }
            }
        }

        cudaFree(d_in1);
        cudaFree(d_in2);
        cudaFree(d_out);
        delete[] h_in1;
        delete[] h_in2;
        delete[] h_out;

        nvrtcDestroyProgram(&prog);
        delete[] ptx;

        // Previously leaked: unload the module before destroying its context.
        cuModuleUnload(module);
        cuCtxDestroy(context);

        if (fail_count != 0)
        {
            std::cerr << fail_count << " elements exceeded the 300 ulp tolerance" << std::endl;
            return EXIT_FAILURE;
        }

        std::cout << "Kernel executed successfully." << std::endl;
        return 0;
    }
    catch(const std::exception& e)
    {
        std::cerr << "Stopped with exception: " << e.what() << std::endl;
        return EXIT_FAILURE;
    }
}
Loading

0 comments on commit 253ad83

Please sign in to comment.