From 885db5b998f83d47c80017bd8651055cec52e6cc Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 10:58:21 -0400 Subject: [PATCH 01/61] Fix typo --- include/boost/math/tools/config.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 2736d660f..3e5ba5ac3 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -733,7 +733,7 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b; template BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_min(const T& a, const T& b) { return a < b ? a : b; } template -BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { return a > b ? a : b; } +BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return a > b ? a : b; } #define BOOST_MATH_GPU_SAFE_SWAP(a, b) gpu_safe_swap(a, b) #define BOOST_MATH_GPU_SAFE_MIN(a, b) gpu_safe_min(a, b) From b356bd31747a60549aff81e74ba4063d3698dd87 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 10:58:38 -0400 Subject: [PATCH 02/61] Add GPU support to bessel_i0 --- .../special_functions/detail/bessel_i0.hpp | 77 ++++++++++--------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/include/boost/math/special_functions/detail/bessel_i0.hpp b/include/boost/math/special_functions/detail/bessel_i0.hpp index af6e8c379..9f14fad44 100644 --- a/include/boost/math/special_functions/detail/bessel_i0.hpp +++ b/include/boost/math/special_functions/detail/bessel_i0.hpp @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -35,24 +38,24 @@ namespace boost { namespace math { namespace detail{ template -T bessel_i0(const T& x); +BOOST_MATH_GPU_ENABLED T bessel_i0(const T& x); template -T bessel_i0_imp(const T&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T&, const boost::math::integral_constant&) { BOOST_MATH_ASSERT(0); return 0; } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) { // Max error in interpolated form: 3.929e-08 // Max Error found at float precision = Poly: 1.991226e-07 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 1.00000003928615375e+00f, 2.49999576572179639e-01f, 2.77785268558399407e-02f, @@ -70,7 +73,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form: 5.195e-08 // Max Error found at float precision = Poly: 8.502534e-08 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 3.98942651588301770e-01f, 4.98327234176892844e-02f, 2.91866904423115499e-02f, @@ -83,7 +86,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form: 1.782e-09 // Max Error found at float precision = Poly: 6.473568e-08 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 3.98942391532752700e-01f, 4.98455950638200020e-02f, 2.94835666900682535e-02f @@ -96,7 +99,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -104,7 +107,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Bessel I0 over[10 ^ 
-16, 7.75] // Max error in interpolated form : 3.042e-18 // Max Error found at double precision = Poly : 5.106609e-16 Cheb : 5.239199e-16 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 1.00000000000000000e+00, 2.49999999999999909e-01, 2.77777777777782257e-02, @@ -128,7 +131,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form : 1.685e-16 // Max Error found at double precision = Poly : 2.575063e-16 Cheb : 2.247615e+00 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 3.98942280401425088e-01, 4.98677850604961985e-02, 2.80506233928312623e-02, @@ -158,7 +161,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form : 2.437e-18 // Max Error found at double precision = Poly : 1.216719e-16 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 3.98942280401432905e-01, 4.98677850491434560e-02, 2.80506308916506102e-02, @@ -173,7 +176,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -182,7 +185,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 3.899e-20 // Max Error found at float80 precision = Poly : 1.770840e-19 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 9.99999999999999999961011629e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 2.50000000000000001321873912e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 2.77777777777777703400424216e-02), @@ -211,8 +214,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 1.631e-04 // Max Error found at float80 precision = Poly : 7.811948e-21 // LCOV_EXCL_START - static const T Y = 4.051098823547363281250e-01f; - static const T P[] = { + BOOST_MATH_STATIC const T Y = 4.051098823547363281250e-01f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -6.158081780620616479492e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 4.883635969834048766148e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 7.892782002476195771920e-02), @@ -237,8 +240,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 1.304e-03 // Max Error found at float80 precision = Poly : 2.303527e-20 // LCOV_EXCL_START - static const T Y = 4.033188819885253906250e-01f; - static const T P[] = { + BOOST_MATH_STATIC const T Y = 4.033188819885253906250e-01f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -4.376373876116109401062e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 4.982899138682911273321e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 3.109477529533515397644e-02), @@ -262,8 +265,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 1.035e-21 // Max Error found at float80 precision = Poly: 1.885872e-21 // LCOV_EXCL_START - static const T Y = 4.011702537536621093750e-01f; - static const T P[] = { + BOOST_MATH_STATIC const T Y = 4.011702537536621093750e-01f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -2.227973351806078464328e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 4.986778486088017419036e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 2.805066823812285310011e-02), @@ -291,7 +294,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 5.587e-20 // 
Max Error found at float80 precision = Poly : 8.776852e-20 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401432677955074061e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 4.98677850501789875615574058e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 2.80506290908675604202206833e-02), @@ -320,7 +323,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -329,7 +332,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 1.274e-34 // Max Error found at float128 precision = Poly : 3.096091e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.0000000000000000000000000000000001273856e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 2.4999999999999999999999999999999107477496e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.7777777777777777777777777777881795230918e-02), @@ -364,7 +367,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 7.534e-35 // Max Error found at float128 precision = Poly : 6.123912e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 9.9999999999999999992388573069504617493518e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.5000000000000000007304739268173096975340e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.7777777777777777744261405400543564492074e-02), @@ -403,7 +406,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 1.808e-34 // Max Error found at float128 precision = Poly : 2.399403e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040870793650581242239624530714032e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867780576714783790784348982178607842250e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 2.8051948347934462928487999569249907599510e-02), @@ -445,7 +448,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 1.487e-34 // Max Error found at float128 precision = Poly : 1.929924e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793996798658172135362278e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867785050179084714910130342157246539820e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 2.8050629090725751585266360464766768437048e-02), @@ -480,7 +483,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 5.459e-35 // Max Error found at float128 precision = Poly : 1.472240e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793994605993438166526772e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867785050179084742493257495245185241487e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 2.8050629090725735167652437695397756897920e-02), @@ -507,33 +510,33 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { if(boost::math::tools::digits() <= 24) - return bessel_i0_imp(x, std::integral_constant()); + return bessel_i0_imp(x, 
boost::math::integral_constant()); else if(boost::math::tools::digits() <= 53) - return bessel_i0_imp(x, std::integral_constant()); + return bessel_i0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 64) - return bessel_i0_imp(x, std::integral_constant()); + return bessel_i0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 113) - return bessel_i0_imp(x, std::integral_constant()); + return bessel_i0_imp(x, boost::math::integral_constant()); BOOST_MATH_ASSERT(0); return 0; } template -inline T bessel_i0(const T& x) +BOOST_MATH_GPU_ENABLED inline T bessel_i0(const T& x) { - typedef std::integral_constant::digits == 0) || (std::numeric_limits::radix != 2)) ? + typedef boost::math::integral_constant::digits == 0) || (boost::math::numeric_limits::radix != 2)) ? 0 : - std::numeric_limits::digits <= 24 ? + boost::math::numeric_limits::digits <= 24 ? 24 : - std::numeric_limits::digits <= 53 ? + boost::math::numeric_limits::digits <= 53 ? 53 : - std::numeric_limits::digits <= 64 ? + boost::math::numeric_limits::digits <= 64 ? 64 : - std::numeric_limits::digits <= 113 ? + boost::math::numeric_limits::digits <= 113 ? 113 : -1 > tag_type; From 7844bd0a6dd1b84b14ff028f955d7985aeb05fb4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 10:58:54 -0400 Subject: [PATCH 03/61] Add CUDA and NVRTC testing --- test/cuda_jamfile | 31 ++++- test/nvrtc_jamfile | 17 +++ test/test_bessel_i0_double.cu | 100 ++++++++++++++ test/test_bessel_i0_float.cu | 100 ++++++++++++++ test/test_bessel_i0_nvrtc_double.cpp | 190 +++++++++++++++++++++++++++ test/test_bessel_i0_nvrtc_float.cpp | 190 +++++++++++++++++++++++++++ 6 files changed, 625 insertions(+), 3 deletions(-) create mode 100644 test/test_bessel_i0_double.cu create mode 100644 test/test_bessel_i0_float.cu create mode 100644 test/test_bessel_i0_nvrtc_double.cpp create mode 100644 test/test_bessel_i0_nvrtc_float.cpp diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 11f2c7616..9fdb4cd23 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -18,12 +18,14 @@ run test_arcsine_pdf_float.cu ; run test_arcsine_quan_double.cu ; run test_arcsine_quan_float.cu ; run test_arcsine_range_support_double.cu ; + run test_bernoulli_cdf_double.cu ; run test_bernoulli_cdf_float.cu ; run test_bernoulli_pdf_double.cu ; run test_bernoulli_pdf_float.cu ; run test_bernoulli_range_support_double.cu ; run test_bernoulli_range_support_float.cu ; + run test_cauchy_cdf_double.cu ; run test_cauchy_cdf_float.cu ; run test_cauchy_pdf_double.cu ; @@ -32,6 +34,7 @@ run test_cauchy_quan_double.cu ; run test_cauchy_quan_float.cu ; run test_cauchy_range_support_double.cu ; run test_cauchy_range_support_float.cu ; + run test_exponential_cdf_double.cu ; run test_exponential_cdf_float.cu ; run test_exponential_pdf_double.cu ; @@ -40,40 +43,47 @@ run test_exponential_quan_double.cu ; run test_exponential_quan_float.cu ; run test_exponential_range_support_double.cu ; run test_exponential_range_support_float.cu ; + run test_extreme_value_cdf_double.cu ; run test_extreme_value_cdf_float.cu ; run test_extreme_value_pdf_double.cu ; run test_extreme_value_pdf_float.cu ; run test_extreme_value_quan_double.cu ; run test_extreme_value_quan_float.cu ; + run test_holtsmark_cdf_double.cu ; run test_holtsmark_cdf_float.cu ; run test_holtsmark_pdf_double.cu ; run test_holtsmark_pdf_float.cu ; + run test_landau_cdf_double.cu ; run test_landau_cdf_float.cu ; run test_landau_pdf_double.cu ; run test_landau_pdf_float.cu ; run 
test_landau_quan_double.cu; run test_landau_quan_float.cu ; + run test_laplace_cdf_double.cu ; run test_laplace_cdf_float.cu ; run test_laplace_pdf_double.cu ; run test_laplace_pdf_float.cu ; run test_laplace_quan_double.cu ; run test_laplace_quan_float.cu ; + run test_logistic_cdf_double.cu ; run test_logistic_cdf_float.cu ; run test_logistic_pdf_double.cu ; run test_logistic_pdf_float.cu ; run test_logistic_quan_double.cu ; run test_logistic_quan_float.cu ; + run test_mapairy_cdf_double.cu ; run test_mapairy_cdf_float.cu ; run test_mapairy_pdf_double.cu ; run test_mapairy_pdf_float.cu ; run test_mapairy_quan_double.cu ; run test_mapairy_quan_float.cu ; + run test_saspoint5_cdf_double.cu ; run test_saspoint5_cdf_float.cu ; run test_saspoint5_pdf_double.cu ; @@ -82,17 +92,24 @@ run test_saspoint5_quan_double.cu ; run test_saspoint5_quan_float.cu ; # Special Functions -# run test_beta_simple.cpp ; run test_beta_double.cu ; run test_beta_float.cu ; + +run test_bessel_i0_double.cu ; +run test_bessel_i0_float.cu ; + run test_cbrt_double.cu ; run test_cbrt_float.cu ; + run test_changesign_double.cu ; run test_changesign_float.cu ; + run test_cos_pi_double.cu ; run test_cos_pi_float.cu ; + run test_digamma_double.cu ; run test_digamma_float.cu ; + run test_erf_double.cu ; run test_erf_float.cu ; run test_erf_inv_double.cu ; @@ -101,21 +118,29 @@ run test_erfc_double.cu ; run test_erfc_float.cu ; run test_erfc_inv_double.cu ; run test_erfc_inv_float.cu ; + run test_expm1_double.cu ; run test_expm1_float.cu ; + run test_lgamma_double.cu ; run test_lgamma_float.cu ; +run test_tgamma_double.cu ; +run test_tgamma_float.cu ; + run test_log1p_double.cu ; run test_log1p_float.cu ; + run test_modf_double.cu ; run test_modf_float.cu ; + run test_round_double.cu ; run test_round_float.cu ; + run test_sin_pi_double.cu ; run test_sin_pi_float.cu ; -run test_tgamma_double.cu ; -run test_tgamma_float.cu ; + run test_trigamma_double.cu ; run test_trigamma_float.cu ; + run test_trunc_double.cu ; run test_trunc_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 7e57f93ce..37c147f8f 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -90,12 +90,19 @@ run test_saspoint5_quan_nvrtc_float.cpp ; # Special Functions run test_beta_nvrtc_double.cpp ; run test_beta_nvrtc_float.cpp ; + +run test_bessel_i0_nvrtc_double.cpp ; +run test_bessel_i0_nvrtc_float.cpp ; + run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; + run test_cos_pi_nvrtc_double.cpp ; run test_cos_pi_nvrtc_float.cpp ; + run test_digamma_nvrtc_double.cpp ; run test_digamma_nvrtc_float.cpp ; + run test_erf_nvrtc_double.cpp ; run test_erf_nvrtc_float.cpp ; run test_erfc_nvrtc_double.cpp ; @@ -104,22 +111,32 @@ run test_erf_inv_nvrtc_double.cpp ; run test_erf_inv_nvrtc_float.cpp ; run test_erfc_inv_nvrtc_double.cpp ; run test_erfc_inv_nvrtc_float.cpp ; + run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; + run test_fpclassify_nvrtc_double.cpp ; run test_fpclassify_nvrtc_float.cpp ; + run test_gamma_nvrtc_double.cpp ; run test_gamma_nvrtc_float.cpp ; + run test_log1p_nvrtc_double.cpp ; run test_log1p_nvrtc_float.cpp ; + run test_modf_nvrtc_double.cpp ; run test_modf_nvrtc_float.cpp ; + run test_round_nvrtc_double.cpp ; run test_round_nvrtc_float.cpp ; + run test_sign_nvrtc_double.cpp ; run test_sign_nvrtc_float.cpp ; + run test_sin_pi_nvrtc_double.cpp ; run test_sin_pi_nvrtc_float.cpp ; + run test_trigamma_nvrtc_double.cpp ; run test_trigamma_nvrtc_float.cpp ; + run test_trunc_nvrtc_double.cpp ; diff --git 
a/test/test_bessel_i0_double.cu b/test/test_bessel_i0_double.cu
new file mode 100644
index 000000000..1c5d0ca14
--- /dev/null
+++ b/test/test_bessel_i0_double.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_i0(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::detail::bessel_i0(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
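
The kernel above exercises boost::math::detail::bessel_i0 directly, but the same value is reachable through the public API as cyl_bessel_i(0, x), which is often the quickest way to spot-check a device result against a host reference. A minimal host-only sketch follows, assuming only the public Boost.Math headers; check_i0_against_public_api is a hypothetical helper for illustration, not part of this patch:

#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <cassert>

// Hypothetical helper: confirm the detail function under test agrees with
// the public cyl_bessel_i API at order 0, using the same 10-epsilon
// tolerance as the CUDA test above.
void check_i0_against_public_api(double x)
{
    const double reference  = boost::math::cyl_bessel_i(0, x);    // public API: I_0(x)
    const double under_test = boost::math::detail::bessel_i0(x);  // function exercised by the kernel
    assert(boost::math::epsilon_difference(reference, under_test) <= 10);
}
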
diff --git a/test/test_bessel_i0_float.cu b/test/test_bessel_i0_float.cu
new file mode 100644
index 000000000..39929d548
--- /dev/null
+++ b/test/test_bessel_i0_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_i0(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::detail::bessel_i0(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_i0_nvrtc_double.cpp b/test/test_bessel_i0_nvrtc_double.cpp
new file mode 100644
index 000000000..0c5db47b4
--- /dev/null
+++ b/test/test_bessel_i0_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_i0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_i0(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_i0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_i0(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_i0_nvrtc_float.cpp b/test/test_bessel_i0_nvrtc_float.cpp new file mode 100644 index 000000000..26d667b97 --- /dev/null +++ b/test/test_bessel_i0_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_i0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_i0(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_i0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_i0(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 7efe28c8339826b3144a094ef19347892dd3597a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 11:09:09 -0400 Subject: [PATCH 04/61] Add GPU support to bessel_i1 --- .../special_functions/detail/bessel_i0.hpp | 1 + .../special_functions/detail/bessel_i1.hpp | 73 ++++++++++--------- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/include/boost/math/special_functions/detail/bessel_i0.hpp b/include/boost/math/special_functions/detail/bessel_i0.hpp index 9f14fad44..f2219cc94 100644 --- a/include/boost/math/special_functions/detail/bessel_i0.hpp +++ b/include/boost/math/special_functions/detail/bessel_i0.hpp @@ -1,5 +1,6 @@ // Copyright (c) 2006 Xiaogang Zhang // Copyright (c) 2017 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) diff --git a/include/boost/math/special_functions/detail/bessel_i1.hpp b/include/boost/math/special_functions/detail/bessel_i1.hpp index badc35de0..d2c750df0 100644 --- a/include/boost/math/special_functions/detail/bessel_i1.hpp +++ b/include/boost/math/special_functions/detail/bessel_i1.hpp @@ -1,4 +1,5 @@ // Copyright (c) 2017 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -17,9 +18,13 @@ #pragma once #endif +#include #include #include #include +#include +#include +#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -38,24 +43,24 @@ namespace boost { namespace math { namespace detail{ template -T bessel_i1(const T& x); +BOOST_MATH_GPU_ENABLED T bessel_i1(const T& x); template -T bessel_i1_imp(const T&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T&, const boost::math::integral_constant&) { BOOST_MATH_ASSERT(0); return 0; } template -T bessel_i1_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) { //Max error in interpolated form : 1.348e-08 // Max Error found at float precision = Poly : 1.469121e-07 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 8.333333221e-02f, 6.944453712e-03f, 3.472097211e-04f, @@ -74,7 +79,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 9.000e-08 // Max Error found at float precision = Poly: 1.044345e-07 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 3.98942115977513013e-01f, -1.49581264836620262e-01f, -4.76475741878486795e-02f, @@ -89,7 +94,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) } template -T bessel_i1_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -98,7 +103,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 5.639e-17 // Max Error found at double precision = Poly: 1.795559e-16 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 8.333333333333333803e-02, 6.944444444444341983e-03, 3.472222222225921045e-04, @@ -122,7 +127,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 1.796e-16 // Max Error found at double precision = Poly: 2.898731e-16 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 3.989422804014406054e-01, -1.496033551613111533e-01, -4.675104253598537322e-02, @@ -152,7 +157,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form: 1.320e-19 // Max Error found at double precision = Poly: 7.065357e-17 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 3.989422804014314820e-01, -1.496033551467584157e-01, -4.675105322571775911e-02, @@ -167,7 +172,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) } template -T bessel_i1_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -175,7 +180,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Bessel I0 over[10 ^ -16, 7.75] // Max error in interpolated form: 8.086e-21 // Max Error found at float80 precision = Poly: 7.225090e-20 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 8.33333333333333333340071817e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 6.94444444444444442462728070e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 3.47222222222222318886683883e-04), @@ -203,7 +208,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Maximum Deviation Found : 3.887e-20 // Expected Error Term : 3.887e-20 // 
Maximum Relative Change in Control Points : 1.681e-04 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942260530218897338680e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -1.49599542849073670179540e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -4.70492865454119188276875e-02), @@ -236,7 +241,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 2.101e-03 // Max Error found at float80 precision = Poly : 6.029974e-20 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401431675205845e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -1.49603355149968887210170e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -4.67510486284376330257260e-02), @@ -258,7 +263,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Bessel I0 over[100, INF] // Max error in interpolated form: 2.456e-20 // Max Error found at float80 precision = Poly: 5.446356e-20 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401432677958445e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -1.49603355150537411254359e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -4.67510484842456251368526e-02), @@ -276,7 +281,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) } template -T bessel_i1_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -285,7 +290,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 1.835e-35 // Max Error found at float128 precision = Poly: 1.645036e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 8.3333333333333333333333333333333331804098e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 6.9444444444444444444444444444445418303082e-03), BOOST_MATH_BIG_CONSTANT(T, 113, 3.4722222222222222222222222222119082346591e-04), @@ -321,7 +326,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 5.204e-03 // Max Error found at float128 precision = Poly : 2.882561e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 8.333333333333333326889717360850080939e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 6.944444444444444511272790848815114507e-03), BOOST_MATH_BIG_CONSTANT(T, 113, 3.472222222222221892451965054394153443e-04), @@ -355,7 +360,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Maximum Deviation Found : 1.766e-35 // Expected Error Term : 1.021e-35 // Maximum Relative Change in Control Points : 6.228e-03 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 8.333333333333255774414858563409941233e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 6.944444444444897867884955912228700291e-03), BOOST_MATH_BIG_CONSTANT(T, 113, 3.472222222220954970397343617150959467e-04), @@ -389,7 +394,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form: 8.864e-36 // Max Error found at float128 precision = Poly: 8.522841e-35 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422793693152031514179994954750043e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -1.496029423752889591425633234009799670e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -4.682975926820553021482820043377990241e-02), @@ -421,7 +426,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error 
in interpolated form: 6.028e-35 // Max Error found at float128 precision = Poly: 1.368313e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422804012941975429616956496046931e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -1.496033550576049830976679315420681402e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -4.675107835141866009896710750800622147e-02), @@ -456,7 +461,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 5.494e-35 // Max Error found at float128 precision = Poly: 1.214651e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422804014326779399307367861631577e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -1.496033551505372542086590873271571919e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -4.675104848454290286276466276677172664e-02), @@ -486,7 +491,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Bessel I0 over[100, INF] // Max error in interpolated form: 6.081e-35 // Max Error found at float128 precision = Poly: 1.407151e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793994605993438200208417e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -1.4960335515053725422747977247811372936584e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -4.6751048484542891946087411826356811991039e-02), @@ -512,33 +517,33 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) } template -T bessel_i1_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { if(boost::math::tools::digits() <= 24) - return bessel_i1_imp(x, std::integral_constant()); + return bessel_i1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 53) - return bessel_i1_imp(x, std::integral_constant()); + return bessel_i1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 64) - return bessel_i1_imp(x, std::integral_constant()); + return bessel_i1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 113) - return bessel_i1_imp(x, std::integral_constant()); + return bessel_i1_imp(x, boost::math::integral_constant()); BOOST_MATH_ASSERT(0); return 0; } template -inline T bessel_i1(const T& x) +inline BOOST_MATH_GPU_ENABLED T bessel_i1(const T& x) { - typedef std::integral_constant::digits == 0) || (std::numeric_limits::radix != 2)) ? + typedef boost::math::integral_constant::digits == 0) || (boost::math::numeric_limits::radix != 2)) ? 0 : - std::numeric_limits::digits <= 24 ? + boost::math::numeric_limits::digits <= 24 ? 24 : - std::numeric_limits::digits <= 53 ? + boost::math::numeric_limits::digits <= 53 ? 53 : - std::numeric_limits::digits <= 64 ? + boost::math::numeric_limits::digits <= 64 ? 64 : - std::numeric_limits::digits <= 113 ? + boost::math::numeric_limits::digits <= 113 ? 
113 : -1 > tag_type; From a34dfe2a09be7fb1650a4da0f9d878f393ab01e5 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 11:09:26 -0400 Subject: [PATCH 05/61] Add CUDA and NVRTC testing of bessel_i1 --- test/cuda_jamfile | 2 + test/nvrtc_jamfile | 2 + test/test_bessel_i1_double.cu | 100 ++++++++++++++ test/test_bessel_i1_float.cu | 100 ++++++++++++++ test/test_bessel_i1_nvrtc_double.cpp | 190 +++++++++++++++++++++++++++ test/test_bessel_i1_nvrtc_float.cpp | 190 +++++++++++++++++++++++++++ 6 files changed, 584 insertions(+) create mode 100644 test/test_bessel_i1_double.cu create mode 100644 test/test_bessel_i1_float.cu create mode 100644 test/test_bessel_i1_nvrtc_double.cpp create mode 100644 test/test_bessel_i1_nvrtc_float.cpp diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 9fdb4cd23..e8a0a2ce4 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -97,6 +97,8 @@ run test_beta_float.cu ; run test_bessel_i0_double.cu ; run test_bessel_i0_float.cu ; +run test_bessel_i1_double.cu ; +run test_bessel_i1_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 37c147f8f..2d76d9226 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -93,6 +93,8 @@ run test_beta_nvrtc_float.cpp ; run test_bessel_i0_nvrtc_double.cpp ; run test_bessel_i0_nvrtc_float.cpp ; +run test_bessel_i1_nvrtc_double.cpp ; +run test_bessel_i1_nvrtc_float.cpp ; run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_bessel_i1_double.cu b/test/test_bessel_i1_double.cu new file mode 100644 index 000000000..e4d6443a6 --- /dev/null +++ b/test/test_bessel_i1_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_i1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::detail::bessel_i1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
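
Both of these tests accept a device result when it sits within 10 machine epsilons of the host value. epsilon_difference comes from <boost/math/special_functions/relative_difference.hpp> and reports the difference between two values scaled by machine epsilon for the argument type, so the same threshold works at float and double precision. A minimal sketch of that check in isolation; roughly_equal is a hypothetical name, not part of the patch:

#include <boost/math/special_functions/relative_difference.hpp>

// Hypothetical helper mirroring the verification loops in these tests:
// accept the GPU result when it differs from the host reference by at
// most `tol` machine epsilons of T.
template <typename T>
bool roughly_equal(T gpu_result, T host_reference, T tol = T(10))
{
    return boost::math::epsilon_difference(gpu_result, host_reference) <= tol;
}
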
diff --git a/test/test_bessel_i1_float.cu b/test/test_bessel_i1_float.cu
new file mode 100644
index 000000000..12ae53542
--- /dev/null
+++ b/test/test_bessel_i1_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_i1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::detail::bessel_i1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_i1_nvrtc_double.cpp b/test/test_bessel_i1_nvrtc_double.cpp
new file mode 100644
index 000000000..c270a6694
--- /dev/null
+++ b/test/test_bessel_i1_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_i1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_i1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_i1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_i1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_i1_nvrtc_float.cpp b/test/test_bessel_i1_nvrtc_float.cpp new file mode 100644 index 000000000..158c6a815 --- /dev/null +++ b/test/test_bessel_i1_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_i1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_i1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_i1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_i1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 4550ebc955e912963f69120d7b46c74cc2259bcc Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 11:18:25 -0400 Subject: [PATCH 06/61] Add tgamma1pm1 NVRTC impl --- include/boost/math/special_functions/gamma.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index afb8e9728..be3bc504f 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -2287,6 +2287,7 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t #else #include +#include namespace boost { namespace math { @@ -2309,6 +2310,19 @@ inline BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&) return boost::math::lgamma(x); } +template +BOOST_MATH_GPU_ENABLED T tgamma1pm1(T z) +{ + using namespace boost::math; + + if (fabs(z) < T(0.55)) + { + return expm1(lgamma(z)); + } + + return expm1(lgamma(1 + z)); +} + } // namespace math } // namespace boost From 590c3ef110e89c214b32e78ca15197e114297f58 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 12:11:51 -0400 Subject: [PATCH 07/61] Add GPU support to iconv --- include/boost/math/special_functions/detail/iconv.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/boost/math/special_functions/detail/iconv.hpp b/include/boost/math/special_functions/detail/iconv.hpp index 90b4aa938..79ac6e905 100644 --- a/include/boost/math/special_functions/detail/iconv.hpp +++ b/include/boost/math/special_functions/detail/iconv.hpp @@ -10,19 +10,19 @@ #pragma once #endif -#include +#include #include namespace 
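// iconv converts T to int through one of the two overloads below: a plain
// static_cast when an implicit T -> int conversion exists, otherwise
// policy-aware rounding via iround.  A rough illustration (the values and
// the mp_type name are hypothetical):
//
//    iconv(3.7, pol);          // double -> int is convertible: truncates to 3
//    iconv(mp_type(3.7), pol); // not convertible: dispatches to iround -> 4
//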
boost { namespace math { namespace detail{

template <class T, class Policy>
-inline int iconv_imp(T v, Policy const&, std::true_type const&)
+inline int iconv_imp(T v, Policy const&, boost::math::true_type const&)
{
   return static_cast<int>(v);
}

template <class T, class Policy>
-inline int iconv_imp(T v, Policy const& pol, std::false_type const&)
+inline int iconv_imp(T v, Policy const& pol, boost::math::false_type const&)
{
   BOOST_MATH_STD_USING
   return iround(v, pol);
@@ -31,7 +31,7 @@ inline int iconv_imp(T v, Policy const& pol, std::false_type const&)
template <class T, class Policy>
inline int iconv(T v, Policy const& pol)
{
-   typedef typename std::is_convertible<T, int>::type tag_type;
+   typedef typename boost::math::is_convertible<T, int>::type tag_type;
   return iconv_imp(v, pol, tag_type());
}

From c560fa906121b61f75062583cb43e31457438dd1 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 16 Aug 2024 12:23:44 -0400
Subject: [PATCH 08/61] Add GPU support to bessel_ik

---
 .../special_functions/detail/bessel_ik.hpp    | 32 +++++++++++--------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/include/boost/math/special_functions/detail/bessel_ik.hpp b/include/boost/math/special_functions/detail/bessel_ik.hpp
index 0c653b475..b3e7378fd 100644
--- a/include/boost/math/special_functions/detail/bessel_ik.hpp
+++ b/include/boost/math/special_functions/detail/bessel_ik.hpp
@@ -1,4 +1,5 @@
 // Copyright (c) 2006 Xiaogang Zhang
+// Copyright (c) 2024 Matt Borland
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,14 +11,17 @@
 #pragma once
 #endif

-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
-#include

 // Modified Bessel functions of the first and second kind of fractional order

@@ -30,13 +34,13 @@ struct cyl_bessel_i_small_z
 {
    typedef T result_type;

-   cyl_bessel_i_small_z(T v_, T z_) : k(0), v(v_), mult(z_*z_/4)
+   BOOST_MATH_GPU_ENABLED cyl_bessel_i_small_z(T v_, T z_) : k(0), v(v_), mult(z_*z_/4)
    {
       BOOST_MATH_STD_USING
       term = 1;
    }

-   T operator()()
+   BOOST_MATH_GPU_ENABLED T operator()()
    {
       T result = term;
       ++k;
@@ -52,7 +56,7 @@ struct cyl_bessel_i_small_z
 };

 template <class T, class Policy>
-inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
 {
    BOOST_MATH_STD_USING
    T prefix;
@@ -69,7 +73,7 @@ inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
       return prefix;

    cyl_bessel_i_small_z<T, Policy> s(v, x);
-   std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
+   boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
    T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon<T, Policy>(), max_iter);
@@ -80,7 +84,7 @@
 // Calculate K(v, x) and K(v+1, x) by method analogous to
 // Temme, Journal of Computational Physics, vol 21, 343 (1976)
 template <typename T, typename Policy>
-int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol)
+BOOST_MATH_GPU_ENABLED int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol)
 {
    T f, h, p, q, coef, sum, sum1, tolerance;
    T a, b, c, d, sigma, gamma1, gamma2;
@@ -157,7 +161,7 @@ int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol)
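 // A sketch (illustration only) of the recurrence evaluated here: with
 // b_k = 2 * (v + k) / x, the A&S relation below gives
 //
 //    I_(v+1)(x) / I_v(x) = 1 / (b_1 + 1 / (b_2 + 1 / (b_3 + ...)))
 //
 // which the code walks with the modified Lentz algorithm under a
 // policy-driven cap on the iteration count.
 // Evaluate continued fraction fv = I_(v+1) / I_v, derived from
 // Abramowitz and Stegun, Handbook of Mathematical Functions, 1972, 9.1.73
 template <typename T, typename Policy>
-int CF1_ik(T v, T x, T* fv, const Policy& pol)
+BOOST_MATH_GPU_ENABLED int CF1_ik(T v, T x, T* fv,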
const Policy& pol)
 {
    T C, D, f, a, b, delta, tiny, tolerance;
    unsigned long k;

@@ -204,7 +208,7 @@ int CF1_ik(T v, T x, T* fv, const Policy& pol)
 // z1 / z0 = U(v+1.5, 2v+1, 2x) / U(v+0.5, 2v+1, 2x), see
 // Thompson and Barnett, Computer Physics Communications, vol 47, 245 (1987)
 template <typename T, typename Policy>
-int CF2_ik(T v, T x, T* Kv, T* Kv1, const Policy& pol)
+BOOST_MATH_GPU_ENABLED int CF2_ik(T v, T x, T* Kv, T* Kv1, const Policy& pol)
 {
    BOOST_MATH_STD_USING
    using namespace boost::math::constants;
@@ -297,7 +301,7 @@ enum{
 // Compute I(v, x) and K(v, x) simultaneously by Temme's method, see
 // Temme, Journal of Computational Physics, vol 19, 324 (1975)
 template <typename T, typename Policy>
-int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
+BOOST_MATH_GPU_ENABLED int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
 {
    // Kv1 = K_(v+1), fv = I_(v+1) / I_v
    // Ku1 = K_(u+1), fu = I_(u+1) / I_u
@@ -314,7 +318,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
    using namespace boost::math::tools;
    using namespace boost::math::constants;

-   static const char* function = "boost::math::bessel_ik<%1%>(%1%,%1%)";
+   constexpr auto function = "boost::math::bessel_ik<%1%>(%1%,%1%)";

    if (v < 0)
    {
@@ -329,7 +333,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
    if (((kind & need_i) == 0) && (fabs(4 * v * v - 25) / (8 * x) < tools::forth_root_epsilon<T>()))
    { // A&S 9.7.2
-      Iv = std::numeric_limits<T>::quiet_NaN(); // any value will do
+      Iv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do
       T mu = 4 * v * v;
       T eight_z = 8 * x;
       Kv = 1 + (mu - 1) / eight_z + (mu - 1) * (mu - 9) / (2 * eight_z * eight_z) + (mu - 1) * (mu - 9) * (mu - 25) / (6 * eight_z * eight_z * eight_z);
@@ -410,7 +414,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
       }
    }
    else
-      Iv = std::numeric_limits<T>::quiet_NaN(); // any value will do
+      Iv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do
    }
    if (reflect)
    {

From 42db57e61363446ffdacc000d397de84aa004622 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 16 Aug 2024 12:24:01 -0400
Subject: [PATCH 09/61] Add SYCL testing of complete bessel_i

---
 test/sycl_jamfile      |  1 +
 test/test_bessel_i.cpp | 14 ++++++++++++++
 test/test_bessel_i.hpp |  6 ++++++
 3 files changed, 21 insertions(+)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index d0a458cce..5e6a14957 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -25,6 +25,7 @@ run test_saspoint5.cpp ;
 # Special Functions
 run pow_test.cpp ;
 run test_beta_simple.cpp ;
+run test_bessel_i.cpp ;
 run test_cbrt.cpp ;
 run test_sign.cpp ;
 run test_round.cpp ;
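(The two integers trailing each add_expected_result call in the next file are
error bounds expressed in multiples of machine epsilon: first the permitted
maximum error, then the permitted mean. The SYCL configuration only widens
those bounds, for example 600 instead of 400 eps on the Random data group;
it does not change what is tested.)

diff --git a/test/test_bessel_i.cpp b/test/test_bessel_i.cpp
index 68dcab0a5..70aac91e4 100644
--- a/test/test_bessel_i.cpp
+++ b/test/test_bessel_i.cpp
@@ -3,7 +3,13 @@
 // Boost Software License, Version 1.0.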
(See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

+#ifndef SYCL_LANGUAGE_VERSION
 #include
+#else
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+#include
+#endif
+
 #include "test_bessel_i.hpp"

 //
@@ -82,7 +88,11 @@ void expected_results()
       "linux",                          // platform
       largest_type,                     // test type(s)
       ".*Random.*",                     // test data group
+      #ifdef SYCL_LANGUAGE_VERSION
+      ".*", 600, 200);
+      #else
       ".*", 400, 200);                  // test function
+      #endif

    add_expected_result(
       "GNU.*",                          // compiler
@@ -111,7 +121,11 @@
       ".*",                             // platform
       largest_type,                     // test type(s)
       ".*",                             // test data group
+      #ifdef SYCL_LANGUAGE_VERSION
+      ".*", 400, 200);
+      #else
       ".*", 20, 10);                    // test function
+      #endif
    //
    // Set error rates a little higher for real_concept -
    // now that we use a series approximation for small z
diff --git a/test/test_bessel_i.hpp b/test/test_bessel_i.hpp
index 2da559f32..aa4f6a4ea 100644
--- a/test/test_bessel_i.hpp
+++ b/test/test_bessel_i.hpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "functor.hpp"
@@ -180,7 +181,10 @@ void test_bessel(T, const char* name)
    //
    // Special cases for full coverage:
    //
+   #ifndef BOOST_MATH_NO_EXCEPTIONS
    BOOST_CHECK_THROW(boost::math::cyl_bessel_i(T(-2.5), T(-2.5)), std::domain_error);
+   #endif
+
    BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(0), T(0)), T(1));
    BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(10), T(0)), T(0));
    BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(-10), T(0)), T(0));
@@ -197,10 +201,12 @@ void test_bessel(T, const char* name)
       }
    }
    T tolerance = boost::math::tools::epsilon<T>() * 100;
+#ifndef SYCL_LANGUAGE_VERSION
    if ((boost::math::tools::digits<T>() <= std::numeric_limits<double>::digits) && (std::numeric_limits<T>::max_exponent > 1000))
    {
       BOOST_CHECK_CLOSE_FRACTION(boost::math::cyl_bessel_i(T(0.5), T(710)), SC_(3.3447452278080108123142599104927325061327359278058601201179e306), tolerance);
    }
+#endif
 #if LDBL_MAX_EXP >= 11356
    BOOST_IF_CONSTEXPR (std::numeric_limits<T>::max_exponent >= 11356)
    {

From 18d3aa24f92fa2817a8eb2053d7bc975d3688bb3 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 16 Aug 2024 13:17:20 -0400
Subject: [PATCH 10/61] Add GPU support to bessel_j0

---
 .../special_functions/detail/bessel_j0.hpp    | 34 ++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/include/boost/math/special_functions/detail/bessel_j0.hpp b/include/boost/math/special_functions/detail/bessel_j0.hpp
index 9a0b26fe6..2df027b21 100644
--- a/include/boost/math/special_functions/detail/bessel_j0.hpp
+++ b/include/boost/math/special_functions/detail/bessel_j0.hpp
@@ -10,6 +10,7 @@
 #pragma once
 #endif

+#include
 #include
 #include
 #include
@@ -32,10 +33,10 @@ namespace boost { namespace math { namespace detail{

 template <typename T>
-T bessel_j0(T x);
+BOOST_MATH_GPU_ENABLED T bessel_j0(T x);

 template <typename T>
-T bessel_j0(T x)
+BOOST_MATH_GPU_ENABLED T bessel_j0(T x)
 {
 #ifdef BOOST_MATH_INSTRUMENT
    static bool b = false;
@@ -48,7 +49,7 @@ T bessel_j0(T x)
    }
 #endif

-   static const T P1[] = {
+   BOOST_MATH_STATIC const T P1[] = {
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.1298668500990866786e+11)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.7282507878605942706e+10)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.2140700423540120665e+08)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6302997904833794242e+06)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.6629814655107086448e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0344222815443188943e+02)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2117036164593528341e-01))
   };
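   // P1/Q1 feed boost::math::tools::evaluate_rational; schematically, on the
   // first approximation interval (roughly x <= 4) with z = x * x:
   //
   //    T r = evaluate_rational(P1, Q1, z);
   //    T factor = (x + x1) * ((x - x11 / 256) - x12);
   //    value = factor * r;
   //
   // so the rational part stays well conditioned near the first zero of J0.
-   static const T Q1[] = {
+   BOOST_MATH_STATIC const T Q1[] = {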
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.3883787996332290397e+12)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.6328198300859648632e+10)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.3985097372263433271e+08)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.5612696224219938200e+05)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.3614022392337710626e+02)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0))
   };
-   static const T P2[] = {
+   BOOST_MATH_STATIC const T P2[] = {
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8319397969392084011e+03)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2254078161378989535e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -7.2879702464464618998e+03)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0341910641583726701e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1725046279757103576e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.4176707025325087628e+03)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.4321196680624245801e+02)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.8591703355916499363e+01))
   };
-   static const T Q2[] = {
+   BOOST_MATH_STATIC const T Q2[] = {
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.5783478026152301072e+05)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4599102262586308984e+05)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.4055062591169562211e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8680990008359188352e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -2.9458766545509337327e+03)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.3307310774649071172e+02)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -2.5258076240801555057e+01)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
   };
-   static const T PC[] = {
+   BOOST_MATH_STATIC const T PC[] = {
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684302e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1345386639580765797e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1170523380864944322e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.4806486443249270347e+03)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5376201909008354296e+02)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.8961548424210455236e-01))
   };
-   static const T QC[] = {
+   BOOST_MATH_STATIC const T QC[] = {
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684318e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1370412495510416640e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1215350561880115730e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.5028735138235608207e+03)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5711159858080893649e+02)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
   };
-   static const T PS[] = {
+   BOOST_MATH_STATIC const T PS[] = {
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.9226600200800094098e+01)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8591953644342993800e+02)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1183429920482737611e+02)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -2.2300261666214198472e+01)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2441026745835638459e+00)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.8033303048680751817e-03))
   };
-   static const T QS[] = {
+   BOOST_MATH_STATIC const T QS[] = {
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.7105024128512061905e+03)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1951131543434613647e+04)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.2642780169211018836e+03)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7887931224040435216e+03)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.0593769594993125859e+01)),
      static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
   };
-   static const T x1  = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4048255576957727686e+00)),
-                x2  = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5200781102863106496e+00)),
-                x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.160e+02)),
-                x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.42444230422723137837e-03)),
-                x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4130e+03)),
-                x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.46860286310649596604e-04));
+
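+   // x1 and x2 are the first two zeros of J0; x11/x12 and x21/x22 carry the
+   // same zeros in split high/low form (j01 ~= x11 / 256 + x12), so factors
+   // such as ((x - x11 / 256) - x12) avoid cancellation right next to a zero.
+   BOOST_MATH_STATIC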
const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4048255576957727686e+00));
+   BOOST_MATH_STATIC const T x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5200781102863106496e+00));
+   BOOST_MATH_STATIC const T x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.160e+02));
+   BOOST_MATH_STATIC const T x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.42444230422723137837e-03));
+   BOOST_MATH_STATIC const T x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4130e+03));
+   BOOST_MATH_STATIC const T x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.46860286310649596604e-04));

    T value, factor, r, rc, rs;

From 07e2e2aba8b4e9175d51c23c0d569cfb4cb83bee Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 16 Aug 2024 13:22:12 -0400
Subject: [PATCH 11/61] Ignore BOOST_MATH_INSTRUMENT_VARIABLE on NVRTC

---
 include/boost/math/tools/config.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp
index 3e5ba5ac3..e1ba2c344 100644
--- a/include/boost/math/tools/config.hpp
+++ b/include/boost/math/tools/config.hpp
@@ -829,6 +829,8 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b;
 #  define BOOST_MATH_INLINE_CONSTEXPR constexpr
 #endif

+#define BOOST_MATH_INSTRUMENT_VARIABLE(x)
+
 #endif // NVRTC

 #endif // BOOST_MATH_TOOLS_CONFIG_HPP

From 37b49a2bd5ee6f46a523aabc592d7a1bf6157e45 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 16 Aug 2024 13:22:30 -0400
Subject: [PATCH 12/61] Add bessel J0 CUDA and NVRTC testing

---
 test/cuda_jamfile                    |   2 +
 test/nvrtc_jamfile                   |   2 +
 test/test_bessel_j0_double.cu        | 100 ++++++++++++++
 test/test_bessel_j0_float.cu         | 100 ++++++++++++++
 test/test_bessel_j0_nvrtc_double.cpp | 190 +++++++++++++++++++++++++++
 test/test_bessel_j0_nvrtc_float.cpp  | 190 +++++++++++++++++++++++++++
 6 files changed, 584 insertions(+)
 create mode 100644 test/test_bessel_j0_double.cu
 create mode 100644 test/test_bessel_j0_float.cu
 create mode 100644 test/test_bessel_j0_nvrtc_double.cpp
 create mode 100644 test/test_bessel_j0_nvrtc_float.cpp

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index e8a0a2ce4..9404c8a89 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -99,6 +99,8 @@ run test_bessel_i0_double.cu ;
 run test_bessel_i0_float.cu ;
 run test_bessel_i1_double.cu ;
 run test_bessel_i1_float.cu ;
+run test_bessel_j0_double.cu ;
+run test_bessel_j0_float.cu ;
 run test_cbrt_double.cu ;
 run test_cbrt_float.cu ;

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 2d76d9226..be2308153 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -95,6 +95,8 @@ run test_bessel_i0_nvrtc_double.cpp ;
 run test_bessel_i0_nvrtc_float.cpp ;
 run test_bessel_i1_nvrtc_double.cpp ;
 run test_bessel_i1_nvrtc_float.cpp ;
+run test_bessel_j0_nvrtc_double.cpp ;
+run test_bessel_j0_nvrtc_float.cpp ;
 run test_cbrt_nvrtc_double.cpp ;
 run test_cbrt_nvrtc_float.cpp ;
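The CUDA drivers added below all follow one pattern: fill a managed input
buffer with random arguments, launch the kernel, recompute serially on the
host, and compare the two with boost::math::epsilon_difference. They rely on
the test directory's cuda_managed_ptr helper for RAII over unified memory.
A minimal sketch of the idea, assuming cudaMallocManaged semantics (the real
helper lives in test/cuda_managed_ptr.hpp and is more complete):

#include <cuda_runtime.h>
#include <cstddef>

template <class T>
class cuda_managed_ptr
{
   T* ptr_ = nullptr;
public:
   explicit cuda_managed_ptr(std::size_t n)
   {
      // One allocation that both host and device code can dereference
      cudaMallocManaged(reinterpret_cast<void**>(&ptr_), n * sizeof(T));
   }
   ~cuda_managed_ptr() { cudaFree(ptr_); }
   cuda_managed_ptr(const cuda_managed_ptr&) = delete;
   cuda_managed_ptr& operator=(const cuda_managed_ptr&) = delete;
   T* get() const { return ptr_; }
   T& operator[](std::size_t i) const { return ptr_[i]; }
};

diff --git a/test/test_bessel_j0_double.cu b/test/test_bessel_j0_double.cu
new file mode 100644
index 000000000..d32474d96
--- /dev/null
+++ b/test/test_bessel_j0_double.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.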
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::detail::bessel_j0(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::detail::bessel_j0(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_bessel_j0_float.cu b/test/test_bessel_j0_float.cu new file mode 100644 index 000000000..48c6b9e39 --- /dev/null +++ b/test/test_bessel_j0_float.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::detail::bessel_j0(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::detail::bessel_j0(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_bessel_j0_nvrtc_double.cpp b/test/test_bessel_j0_nvrtc_double.cpp new file mode 100644 index 000000000..8c8b79841 --- /dev/null +++ b/test/test_bessel_j0_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_j0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_j0(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_j0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_j0(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_j0_nvrtc_float.cpp b/test/test_bessel_j0_nvrtc_float.cpp new file mode 100644 index 000000000..4a54b1eaa --- /dev/null +++ b/test/test_bessel_j0_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_j0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_j0(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_j0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_j0(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 8bc0d1351b4312b6e6366ec7cbc17db4b9b47620 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 13:42:41 -0400 Subject: [PATCH 13/61] Add GPU support to bessel_j1 --- .../special_functions/detail/bessel_j1.hpp | 46 ++++++++++--------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/include/boost/math/special_functions/detail/bessel_j1.hpp b/include/boost/math/special_functions/detail/bessel_j1.hpp index 6d354dcce..43df9fa0c 100644 --- a/include/boost/math/special_functions/detail/bessel_j1.hpp +++ b/include/boost/math/special_functions/detail/bessel_j1.hpp @@ -10,6 +10,7 @@ #pragma once #endif +#include #include #include #include @@ -32,27 +33,29 @@ namespace boost { namespace math{ namespace detail{ template -T bessel_j1(T x); +BOOST_MATH_GPU_ENABLED T bessel_j1(T x); template struct bessel_j1_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(); } - static void do_init() + BOOST_MATH_GPU_ENABLED static void do_init() { bessel_j1(T(1)); } - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; - static const init initializer; - static void force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -60,11 +63,11 @@ template const typename bessel_j1_initializer::init bessel_j1_initializer::initializer; template -T bessel_j1(T x) +BOOST_MATH_GPU_ENABLED T bessel_j1(T x) { bessel_j1_initializer::force_instantiate(); - static const T P1[] = { + BOOST_MATH_STATIC const T P1[] = { 
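      // Same rational-approximation layout as bessel_j0.hpp above: P1/Q1 and
      // P2/Q2 serve the two small-argument intervals, while PC/QC and PS/QS
      // are the auxiliary series of the large-argument asymptotic form.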
static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4258509801366645672e+11)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6781041261492395835e+09)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1548696764841276794e+08)), @@ -73,7 +76,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0650724020080236441e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.0767857011487300348e-02)) }; - static const T Q1[] = { + BOOST_MATH_STATIC const T Q1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1868604460820175290e+12)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.2091902282580133541e+10)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.0228375140097033958e+08)), @@ -82,7 +85,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)) }; - static const T P2[] = { + BOOST_MATH_STATIC const T P2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.7527881995806511112e+16)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.6608531731299018674e+15)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.6658018905416665164e+13)), @@ -92,7 +95,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -7.5023342220781607561e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.6179191852758252278e+00)) }; - static const T Q2[] = { + BOOST_MATH_STATIC const T Q2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7253905888447681194e+18)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7128800897135812012e+16)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.4899346165481429307e+13)), @@ -102,7 +105,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.3886978985861357615e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)) }; - static const T PC[] = { + BOOST_MATH_STATIC const T PC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278571e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9422465050776411957e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -6.6033732483649391093e+06)), @@ -111,7 +114,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.6116166443246101165e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)) }; - static const T QC[] = { + BOOST_MATH_STATIC const T QC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278568e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9341243899345856590e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -6.5853394797230870728e+06)), @@ -120,7 +123,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4550094401904961825e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)) }; - static const T PS[] = { + BOOST_MATH_STATIC const T PS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.3220913409857223519e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.5145160675335701966e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6178836581270835179e+04)), @@ -129,7 +132,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.5265133846636032186e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)) }; - static const T QS[] = { + BOOST_MATH_STATIC const T QS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0871281941028743574e+05)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8194580422439972989e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4194606696037208929e+06)), @@ -138,12 +141,13 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.6383677696049909675e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)) }; - static const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 
3.8317059702075123156e+00)), - x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0155866698156187535e+00)), - x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 9.810e+02)), - x12 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.2527979248768438556e-04)), - x21 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7960e+03)), - x22 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.8330184381246462950e-05)); + + BOOST_MATH_STATIC const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.8317059702075123156e+00)); + BOOST_MATH_STATIC const T x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0155866698156187535e+00)); + BOOST_MATH_STATIC const T x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 9.810e+02)); + BOOST_MATH_STATIC const T x12 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.2527979248768438556e-04)); + BOOST_MATH_STATIC const T x21 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7960e+03)); + BOOST_MATH_STATIC const T x22 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.8330184381246462950e-05)); T value, factor, r, rc, rs, w; From 09c4668a34538cb7d64104c17768cf8c181c33cf Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 13:43:02 -0400 Subject: [PATCH 14/61] Add bessel j1 CUDA and NVRTC testing --- test/cuda_jamfile | 2 + test/nvrtc_jamfile | 2 + test/test_bessel_j1_double.cu | 100 ++++++++++++++ test/test_bessel_j1_float.cu | 100 ++++++++++++++ test/test_bessel_j1_nvrtc_double.cpp | 190 +++++++++++++++++++++++++++ test/test_bessel_j1_nvrtc_float.cpp | 190 +++++++++++++++++++++++++++ 6 files changed, 584 insertions(+) create mode 100644 test/test_bessel_j1_double.cu create mode 100644 test/test_bessel_j1_float.cu create mode 100644 test/test_bessel_j1_nvrtc_double.cpp create mode 100644 test/test_bessel_j1_nvrtc_float.cpp diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 9404c8a89..aca357646 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -101,6 +101,8 @@ run test_bessel_i1_double.cu ; run test_bessel_i1_float.cu ; run test_bessel_j0_double.cu ; run test_bessel_j0_float.cu ; +run test_bessel_j1_double.cu ; +run test_bessel_j1_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index be2308153..fb1a43b12 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -97,6 +97,8 @@ run test_bessel_i1_nvrtc_double.cpp ; run test_bessel_i1_nvrtc_float.cpp ; run test_bessel_j0_nvrtc_double.cpp ; run test_bessel_j0_nvrtc_float.cpp ; +run test_bessel_j1_nvrtc_double.cpp ; +run test_bessel_j1_nvrtc_float.cpp ; run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_bessel_j1_double.cu b/test/test_bessel_j1_double.cu new file mode 100644 index 000000000..33a6e71b6 --- /dev/null +++ b/test/test_bessel_j1_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::detail::bessel_j1(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::detail::bessel_j1(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_bessel_j1_float.cu b/test/test_bessel_j1_float.cu new file mode 100644 index 000000000..14dd37be3 --- /dev/null +++ b/test/test_bessel_j1_float.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::detail::bessel_j1(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::detail::bessel_j1(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_bessel_j1_nvrtc_double.cpp b/test/test_bessel_j1_nvrtc_double.cpp new file mode 100644 index 000000000..11460c11d --- /dev/null +++ b/test/test_bessel_j1_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_j1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_j1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_j1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_j1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_j1_nvrtc_float.cpp b/test/test_bessel_j1_nvrtc_float.cpp new file mode 100644 index 000000000..8f7cc6e3f --- /dev/null +++ b/test/test_bessel_j1_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_j1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_j1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_j1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_j1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 46f322144185c16f414e62443d10ec22e4e1eec3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 14:28:08 -0400 Subject: [PATCH 15/61] Add GPU support to bessel jn and jy --- .../special_functions/detail/bessel_jn.hpp | 6 ++++- .../special_functions/detail/bessel_jy.hpp | 17 ++++++------ .../detail/bessel_jy_asym.hpp | 18 +++++++------ .../detail/bessel_jy_series.hpp | 27 +++++++++---------- 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/include/boost/math/special_functions/detail/bessel_jn.hpp b/include/boost/math/special_functions/detail/bessel_jn.hpp index a08af0548..73bc0c562 100644 --- a/include/boost/math/special_functions/detail/bessel_jn.hpp +++ b/include/boost/math/special_functions/detail/bessel_jn.hpp @@ -10,6 +10,10 @@ #pragma once #endif +#include +#include +#include +#include #include #include #include @@ -24,7 +28,7 @@ namespace boost { namespace math { namespace detail{ template -T bessel_jn(int n, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T bessel_jn(int n, T x, const Policy& pol) { T value(0), factor, current, prev, next; diff --git a/include/boost/math/special_functions/detail/bessel_jy.hpp b/include/boost/math/special_functions/detail/bessel_jy.hpp index 90e099eb7..d43f3050c 100644 --- a/include/boost/math/special_functions/detail/bessel_jy.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy.hpp @@ -20,7 +20,6 @@ #include #include #include -#include // Bessel functions of the first and second kind of fractional order @@ -38,7 +37,7 @@ namespace boost { namespace math { // try it and see... 
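// A sketch of the standard large-x Hankel asymptotics (A&S 9.2.5/9.2.6) that
// hankel_PQ below relies on; with chi = x - (v/2 + 1/4) * pi:
//
//    J_v(x) ~ sqrt(2/(pi x)) * (P(v,x) * cos(chi) - Q(v,x) * sin(chi))
//    Y_v(x) ~ sqrt(2/(pi x)) * (P(v,x) * sin(chi) + Q(v,x) * cos(chi))
//
// so a single evaluation of the P and Q series yields both J and Y at once.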
// template - bool hankel_PQ(T v, T x, T* p, T* q, const Policy& ) + BOOST_MATH_GPU_ENABLED bool hankel_PQ(T v, T x, T* p, T* q, const Policy& ) { BOOST_MATH_STD_USING T tolerance = 2 * policies::get_epsilon(); @@ -70,7 +69,7 @@ namespace boost { namespace math { // Calculate Y(v, x) and Y(v+1, x) by Temme's method, see // Temme, Journal of Computational Physics, vol 21, 343 (1976) template - int temme_jy(T v, T x, T* Y, T* Y1, const Policy& pol) + BOOST_MATH_GPU_ENABLED int temme_jy(T v, T x, T* Y, T* Y1, const Policy& pol) { T g, h, p, q, f, coef, sum, sum1, tolerance; T a, d, e, sigma; @@ -139,7 +138,7 @@ namespace boost { namespace math { // Evaluate continued fraction fv = J_(v+1) / J_v, see // Abramowitz and Stegun, Handbook of Mathematical Functions, 1972, 9.1.73 template - int CF1_jy(T v, T x, T* fv, int* sign, const Policy& pol) + BOOST_MATH_GPU_ENABLED int CF1_jy(T v, T x, T* fv, int* sign, const Policy& pol) { T C, D, f, a, b, delta, tiny, tolerance; unsigned long k; @@ -185,7 +184,7 @@ namespace boost { namespace math { // real values only. // template - int CF2_jy(T v, T x, T* p, T* q, const Policy& pol) + BOOST_MATH_GPU_ENABLED int CF2_jy(T v, T x, T* p, T* q, const Policy& pol) { BOOST_MATH_STD_USING @@ -254,13 +253,13 @@ namespace boost { namespace math { return 0; } - static const int need_j = 1; - static const int need_y = 2; + BOOST_MATH_STATIC const int need_j = 1; + BOOST_MATH_STATIC const int need_y = 2; // Compute J(v, x) and Y(v, x) simultaneously by Steed's method, see // Barnett et al, Computer Physics Communications, vol 8, 377 (1974) template - int bessel_jy(T v, T x, T* J, T* Y, int kind, const Policy& pol) + BOOST_MATH_GPU_ENABLED int bessel_jy(T v, T x, T* J, T* Y, int kind, const Policy& pol) { BOOST_MATH_ASSERT(x >= 0); @@ -273,7 +272,7 @@ namespace boost { namespace math { T cp = 0; T sp = 0; - static const char* function = "boost::math::bessel_jy<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::bessel_jy<%1%>(%1%,%1%)"; BOOST_MATH_STD_USING using namespace boost::math::tools; diff --git a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp index cb09b202d..4bb11c2ff 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp @@ -16,12 +16,14 @@ #pragma once #endif +#include +#include #include namespace boost{ namespace math{ namespace detail{ template -inline T asymptotic_bessel_amplitude(T v, T x) +BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_amplitude(T v, T x) { // Calculate the amplitude of J(v, x) and Y(v, x) for large // x: see A&S 9.2.28. @@ -39,7 +41,7 @@ inline T asymptotic_bessel_amplitude(T v, T x) } template -T asymptotic_bessel_phase_mx(T v, T x) +BOOST_MATH_GPU_ENABLED T asymptotic_bessel_phase_mx(T v, T x) { // // Calculate the phase of J(v, x) and Y(v, x) for large x. @@ -63,7 +65,7 @@ T asymptotic_bessel_phase_mx(T v, T x) } template -inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol) { // See A&S 9.2.19. BOOST_MATH_STD_USING @@ -93,7 +95,7 @@ inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol) } template -inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol) { // See A&S 9.2.19. 
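   // A sketch of the leading terms of that expansion, with mu = 4v^2 and
   // chi = x - (v/2 + 1/4) * pi (these are the standard coefficients, quoted
   // here for orientation rather than taken from this file):
   //
   //    J_v(x) ~ sqrt(2/(pi x)) * [ cos(chi) * (1 - (mu-1)(mu-9)/(2! * (8x)^2) + ...)
   //                              - sin(chi) * ((mu-1)/(8x) - ...) ]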
BOOST_MATH_STD_USING @@ -124,7 +126,7 @@ inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol) } template -inline bool asymptotic_bessel_large_x_limit(int v, const T& x) +BOOST_MATH_GPU_ENABLED inline bool asymptotic_bessel_large_x_limit(int v, const T& x) { BOOST_MATH_STD_USING // @@ -142,7 +144,7 @@ inline bool asymptotic_bessel_large_x_limit(int v, const T& x) } template -inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x) +BOOST_MATH_GPU_ENABLED inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x) { BOOST_MATH_STD_USING // @@ -159,7 +161,7 @@ inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x) } template -void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol) +BOOST_MATH_GPU_ENABLED void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol) { T c = 1; T p = (v / boost::math::sin_pi(v, pol)) * pow(x / 2, -v) / boost::math::tgamma(1 - v, pol); @@ -193,7 +195,7 @@ void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol) } template -T asymptotic_bessel_i_large_x(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T asymptotic_bessel_i_large_x(T v, T x, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names T s = 1; diff --git a/include/boost/math/special_functions/detail/bessel_jy_series.hpp b/include/boost/math/special_functions/detail/bessel_jy_series.hpp index db46f3640..5c083f348 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_series.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy_series.hpp @@ -10,10 +10,9 @@ #pragma once #endif -#include -#include #include #include +#include namespace boost { namespace math { namespace detail{ @@ -22,7 +21,7 @@ struct bessel_j_small_z_series_term { typedef T result_type; - bessel_j_small_z_series_term(T v_, T x) + BOOST_MATH_GPU_ENABLED bessel_j_small_z_series_term(T v_, T x) : N(0), v(v_) { BOOST_MATH_STD_USING @@ -30,7 +29,7 @@ struct bessel_j_small_z_series_term mult *= -mult; term = 1; } - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { T r = term; ++N; @@ -49,7 +48,7 @@ struct bessel_j_small_z_series_term // Converges rapidly for all z << v. 
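// Concretely, the term functor above generates the ascending series
// (A&S 9.1.10):
//
//    J_v(z) = (z/2)^v * sum_{k>=0} (-z^2/4)^k / (k! * Gamma(v+k+1))
//
// with the common (z/2)^v / Gamma(v+1) prefix factored out below so it can be
// checked for overflow/underflow before the series is summed.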
// template -inline T bessel_j_small_z_series(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T bessel_j_small_z_series(T v, T x, const Policy& pol) { BOOST_MATH_STD_USING T prefix; @@ -66,7 +65,7 @@ inline T bessel_j_small_z_series(T v, T x, const Policy& pol) return prefix; bessel_j_small_z_series_term s(v, x); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter); @@ -79,7 +78,7 @@ struct bessel_y_small_z_series_term_a { typedef T result_type; - bessel_y_small_z_series_term_a(T v_, T x) + BOOST_MATH_GPU_ENABLED bessel_y_small_z_series_term_a(T v_, T x) : N(0), v(v_) { BOOST_MATH_STD_USING @@ -87,7 +86,7 @@ struct bessel_y_small_z_series_term_a mult *= -mult; term = 1; } - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { BOOST_MATH_STD_USING T r = term; @@ -107,7 +106,7 @@ struct bessel_y_small_z_series_term_b { typedef T result_type; - bessel_y_small_z_series_term_b(T v_, T x) + BOOST_MATH_GPU_ENABLED bessel_y_small_z_series_term_b(T v_, T x) : N(0), v(v_) { BOOST_MATH_STD_USING @@ -115,7 +114,7 @@ struct bessel_y_small_z_series_term_b mult *= -mult; term = 1; } - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { T r = term; ++N; @@ -138,10 +137,10 @@ struct bessel_y_small_z_series_term_b // eps/2 * v^v(x/2)^-v > (x/2)^v or log(eps/2) > v log((x/2)^2/v) // template -inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "bessel_y_small_z_series<%1%>(%1%,%1%)"; + constexpr auto function = "bessel_y_small_z_series<%1%>(%1%,%1%)"; T prefix; T gam; T p = log(x / 2); @@ -183,7 +182,7 @@ inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol) prefix = -exp(prefix); } bessel_y_small_z_series_term_a s(v, x); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); *pscale = scale; T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter); @@ -211,7 +210,7 @@ inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol) } template -T bessel_yn_small_z(int n, T z, T* scale, const Policy& pol) +BOOST_MATH_GPU_ENABLED T bessel_yn_small_z(int n, T z, T* scale, const Policy& pol) { // // See http://functions.wolfram.com/Bessel-TypeFunctions/BesselY/06/01/04/01/02/ From c700f7b7dc0135cc8146bb18ce175b9c8830cb23 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 14:28:22 -0400 Subject: [PATCH 16/61] Add SYCL bessel j testing --- test/sycl_jamfile | 1 + test/test_bessel_j.cpp | 5 +++++ test/test_bessel_j.hpp | 3 +++ 3 files changed, 9 insertions(+) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 5e6a14957..3075a8971 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -26,6 +26,7 @@ run test_saspoint5.cpp ; run pow_test.cpp ; run test_beta_simple.cpp ; run test_bessel_i.cpp ; +run test_bessel_j.cpp ; run test_cbrt.cpp ; run test_sign.cpp ; run test_round.cpp ; diff --git a/test/test_bessel_j.cpp b/test/test_bessel_j.cpp index 19a5f7426..516e34c29 100644 --- a/test/test_bessel_j.cpp +++ b/test/test_bessel_j.cpp @@ -3,7 +3,12 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#else +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +#include +#endif #include "test_bessel_j.hpp" diff --git a/test/test_bessel_j.hpp b/test/test_bessel_j.hpp index 82106213e..c0b719ad8 100644 --- a/test/test_bessel_j.hpp +++ b/test/test_bessel_j.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -279,7 +280,9 @@ void test_bessel(T, const char* name) BOOST_MATH_CHECK_THROW(boost::math::sph_bessel(2, T(-2.0)), std::domain_error); BOOST_CHECK_EQUAL(boost::math::cyl_bessel_j(T(0), T(2.5)), boost::math::cyl_bessel_j(T(0), T(-2.5))); BOOST_CHECK_EQUAL(boost::math::cyl_bessel_j(T(1), T(2.5)), -boost::math::cyl_bessel_j(T(1), T(-2.5))); + #ifndef SYCL_LANGUAGE_VERSION BOOST_CHECK_CLOSE_FRACTION(boost::math::cyl_bessel_j(364, T(38.5)), SC_(1.793940496519190500748409872348034004417458734118663909894e-309), tolerance); + #endif // // Special cases at infinity: // From 6478bcc5c6d9a40b7f87100da4a47c48f8f6ce47 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 14:51:01 -0400 Subject: [PATCH 17/61] Add bessel_k0 GPU support --- .../detail/airy_ai_bi_zero.hpp | 48 +++++--- .../special_functions/detail/bessel_k0.hpp | 114 +++++++++--------- 2 files changed, 93 insertions(+), 69 deletions(-) diff --git a/include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp b/include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp index 7735eb858..e518422f1 100644 --- a/include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp +++ b/include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp @@ -13,6 +13,8 @@ #ifndef BOOST_MATH_AIRY_AI_BI_ZERO_2013_01_20_HPP_ #define BOOST_MATH_AIRY_AI_BI_ZERO_2013_01_20_HPP_ + #include + #include #include #include @@ -21,18 +23,18 @@ { // Forward declarations of the needed Airy function implementations. 
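   // A sketch of the strategy used in this header: the initial guess for the
   // m-th zero comes from the asymptotic formula of A&S 10.4.105 (evaluated by
   // equation_as_10_4_105 below), e.g. for Ai:
   //
   //    a_m ~ -T(3 pi (4m - 1) / 8),
   //    T(t) = t^(2/3) * (1 + (5/48) t^-2 - (5/36) t^-4 + ...)
   //
   // after which the zero is polished by Newton iteration using the
   // (value, derivative) function objects defined further down.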
template - T airy_ai_imp(T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED T airy_ai_imp(T x, const Policy& pol); template - T airy_bi_imp(T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED T airy_bi_imp(T x, const Policy& pol); template - T airy_ai_prime_imp(T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED T airy_ai_prime_imp(T x, const Policy& pol); template - T airy_bi_prime_imp(T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED T airy_bi_prime_imp(T x, const Policy& pol); namespace airy_zero { template - T equation_as_10_4_105(const T& z, const Policy& pol) + BOOST_MATH_GPU_ENABLED T equation_as_10_4_105(const T& z, const Policy& pol) { const T one_over_z (T(1) / z); const T one_over_z_squared(one_over_z * one_over_z); @@ -54,7 +56,7 @@ namespace airy_ai_zero_detail { template - T initial_guess(const int m, const Policy& pol) + BOOST_MATH_GPU_ENABLED T initial_guess(const int m, const Policy& pol) { T guess; @@ -106,11 +108,19 @@ class function_object_ai_and_ai_prime { public: - explicit function_object_ai_and_ai_prime(const Policy& pol) : my_pol(pol) { } + BOOST_MATH_GPU_ENABLED explicit function_object_ai_and_ai_prime(const Policy& pol) : my_pol(pol) { } - function_object_ai_and_ai_prime(const function_object_ai_and_ai_prime&) = default; + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_suppress 20012 + #endif - boost::math::tuple operator()(const T& x) const + BOOST_MATH_GPU_ENABLED function_object_ai_and_ai_prime(const function_object_ai_and_ai_prime&) = default; + + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_default 20012 + #endif + + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& x) const { // Return a tuple containing both Ai(x) and Ai'(x). return boost::math::make_tuple( @@ -127,7 +137,7 @@ namespace airy_bi_zero_detail { template - T initial_guess(const int m, const Policy& pol) + BOOST_MATH_GPU_ENABLED T initial_guess(const int m, const Policy& pol) { T guess; @@ -179,11 +189,19 @@ class function_object_bi_and_bi_prime { public: - explicit function_object_bi_and_bi_prime(const Policy& pol) : my_pol(pol) { } - - function_object_bi_and_bi_prime(const function_object_bi_and_bi_prime&) = default; - - boost::math::tuple operator()(const T& x) const + BOOST_MATH_GPU_ENABLED explicit function_object_bi_and_bi_prime(const Policy& pol) : my_pol(pol) { } + + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_suppress 20012 + #endif + + BOOST_MATH_GPU_ENABLED function_object_bi_and_bi_prime(const function_object_bi_and_bi_prime&) = default; + + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_default 20012 + #endif + + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& x) const { // Return a tuple containing both Bi(x) and Bi'(x). 
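         // The (value, derivative) pair is what Newton's method consumes; a
         // minimal usage sketch (guess/lower/upper/digits are hypothetical
         // placeholders, newton_raphson_iterate is the usual Boost.Math
         // root finder):
         //
         //    function_object_bi_and_bi_prime<T, Policy> f(pol);
         //    T zero = boost::math::tools::newton_raphson_iterate(f, guess, lower, upper, digits);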
return boost::math::make_tuple( diff --git a/include/boost/math/special_functions/detail/bessel_k0.hpp b/include/boost/math/special_functions/detail/bessel_k0.hpp index f29ffa75c..bab202b6c 100644 --- a/include/boost/math/special_functions/detail/bessel_k0.hpp +++ b/include/boost/math/special_functions/detail/bessel_k0.hpp @@ -13,10 +13,14 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include +#include +#include +#include #include #include -#include #include +#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -44,35 +48,37 @@ namespace boost { namespace math { namespace detail{ template -T bessel_k0(const T& x); +BOOST_MATH_GPU_ENABLED T bessel_k0(const T& x); template struct bessel_k0_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { bessel_k0(T(0.5)); bessel_k0(T(1.5)); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { bessel_k0(T(0.5)); bessel_k0(T(1.5)); } template - static void do_init(const U&){} - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED static void do_init(const U&){} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; - static const init initializer; - static void force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -81,14 +87,14 @@ const typename bessel_k0_initializer::init bessel_k0_initializer template -T bessel_k0_imp(const T&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T&, const boost::math::integral_constant&) { BOOST_MATH_ASSERT(0); return 0; } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -97,14 +103,14 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : -2.358e-09 // Maximum Relative Change in Control Points : 9.552e-02 // Max Error found at float precision = Poly : 4.448220e-08 - static const T Y = 1.137250900268554688f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.137250900268554688f; + BOOST_MATH_STATIC const T P[] = { -1.372508979104259711e-01f, 2.622545986273687617e-01f, 5.047103728247919836e-03f }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000000000000e+00f, -8.928694018000029415e-02f, @@ -117,7 +123,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : -1.343e-09 // Maximum Relative Change in Control Points : 2.405e-02 // Max Error found at float precision = Poly : 1.354814e-07 - static const T P2[] = { + BOOST_MATH_STATIC const T P2[] = { 1.159315158e-01f, 2.789828686e-01f, 2.524902861e-02f, @@ -133,14 +139,14 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 9.064e-02 // Max Error found at float precision = Poly : 5.065020e-08 - static const T P[] = + BOOST_MATH_STATIC const T P[] = { 2.533141220e-01f, 5.221502603e-01f, 6.380180669e-02f, -5.934976547e-02f }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000e+00f, 2.679722431e+00f, @@ -158,7 +164,7 @@ T bessel_k0_imp(const T& x, const 
std::integral_constant&) } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -167,8 +173,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : -6.077e-17 // Maximum Relative Change in Control Points : 7.797e-02 // Max Error found at double precision = Poly : 1.003156e-16 - static const T Y = 1.137250900268554688; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.137250900268554688; + BOOST_MATH_STATIC const T P[] = { -1.372509002685546267e-01, 2.574916117833312855e-01, @@ -176,7 +182,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) 5.445476986653926759e-04, 7.125159422136622118e-06 }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000000000000e+00, -5.458333438017788530e-02, @@ -191,7 +197,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 3.392e-18 // Maximum Relative Change in Control Points : 2.041e-02 // Max Error found at double precision = Poly : 2.513112e-16 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { 1.159315156584124484e-01, 2.789828789146031732e-01, @@ -212,8 +218,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 2.757e-01 // Max Error found at double precision = Poly : 1.001560e-16 - static const T Y = 1; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1; + BOOST_MATH_STATIC const T P[] = { 2.533141373155002416e-01, 3.628342133984595192e+00, @@ -225,7 +231,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) -1.414237994269995877e+00, -9.369168119754924625e-02 }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000000000000e+00, 1.494194694879908328e+01, @@ -248,7 +254,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -257,8 +263,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 2.180e-22 // Maximum Relative Change in Control Points : 2.943e-01 // Max Error found at float80 precision = Poly : 3.923207e-20 - static const T Y = 1.137250900268554687500e+00; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.137250900268554687500e+00; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -1.372509002685546875002e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 2.566481981037407600436e-01), @@ -267,7 +273,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) BOOST_MATH_BIG_CONSTANT(T, 64, 1.213747930378196492543e-05), BOOST_MATH_BIG_CONSTANT(T, 64, 9.423709328020389560844e-08) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -4.843828412587773008342e-02), @@ -284,7 +290,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : -2.434e-21 // Maximum Relative Change in Control Points : 2.459e-02 // Max Error found at float80 precision = Poly : 1.482487e-19 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.159315156584124488110e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 2.764832791416047889734e-01), @@ -292,7 +298,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) 
BOOST_MATH_BIG_CONSTANT(T, 64, 3.660777862036966089410e-04), BOOST_MATH_BIG_CONSTANT(T, 64, 2.094942446930673386849e-06) }; - static const T Q2[] = + BOOST_MATH_STATIC const T Q2[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -2.156100313881251616320e-02), @@ -308,8 +314,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 2.236e-21 // Maximum Relative Change in Control Points : 3.021e-01 //Max Error found at float80 precision = Poly : 8.727378e-20 - static const T Y = 1; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 2.533141373155002512056e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 5.417942070721928652715e+00), @@ -323,7 +329,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) BOOST_MATH_BIG_CONSTANT(T, 64, -4.059789241612946683713e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -1.612783121537333908889e-01) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, 2.200669254769325861404e+01), @@ -348,7 +354,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -357,8 +363,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 5.682e-37 // Maximum Relative Change in Control Points : 6.094e-04 // Max Error found at float128 precision = Poly : 5.338213e-35 - static const T Y = 1.137250900268554687500000000000000000e+00f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.137250900268554687500000000000000000e+00f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -1.372509002685546875000000000000000006e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.556212905071072782462974351698081303e-01), @@ -369,7 +375,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) BOOST_MATH_BIG_CONSTANT(T, 113, 1.752489221949580551692915881999762125e-09), BOOST_MATH_BIG_CONSTANT(T, 113, 5.243010555737173524710512824955368526e-12) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, -4.095631064064621099785696980653193721e-02), @@ -387,7 +393,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 5.105e-38 // Maximum Relative Change in Control Points : 9.734e-03 // Max Error found at float128 precision = Poly : 1.688806e-34 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.159315156584124488107200313757741370e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.789828789146031122026800078439435369e-01), @@ -413,8 +419,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 4.917e-40 // Maximum Relative Change in Control Points : 3.385e-01 // Max Error found at float128 precision = Poly : 1.567573e-34 - static const T Y = 1; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 2.533141373155002512078826424055226265e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.001949740768235770078339977110749204e+01), @@ -439,7 +445,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) BOOST_MATH_BIG_CONSTANT(T, 113, 
-4.201632288615609937883545928660649813e+03), BOOST_MATH_BIG_CONSTANT(T, 113, -3.690820607338480548346746717311811406e+01) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 7.964877874035741452203497983642653107e+01), @@ -475,33 +481,33 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { if(boost::math::tools::digits() <= 24) - return bessel_k0_imp(x, std::integral_constant()); + return bessel_k0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 53) - return bessel_k0_imp(x, std::integral_constant()); + return bessel_k0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 64) - return bessel_k0_imp(x, std::integral_constant()); + return bessel_k0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 113) - return bessel_k0_imp(x, std::integral_constant()); + return bessel_k0_imp(x, boost::math::integral_constant()); BOOST_MATH_ASSERT(0); return 0; } template -inline T bessel_k0(const T& x) +BOOST_MATH_GPU_ENABLED inline T bessel_k0(const T& x) { - typedef std::integral_constant::digits == 0) || (std::numeric_limits::radix != 2)) ? + typedef boost::math::integral_constant::digits == 0) || (boost::math::numeric_limits::radix != 2)) ? 0 : - std::numeric_limits::digits <= 24 ? + boost::math::numeric_limits::digits <= 24 ? 24 : - std::numeric_limits::digits <= 53 ? + boost::math::numeric_limits::digits <= 53 ? 53 : - std::numeric_limits::digits <= 64 ? + boost::math::numeric_limits::digits <= 64 ? 64 : - std::numeric_limits::digits <= 113 ? + boost::math::numeric_limits::digits <= 113 ? 
113 : -1 > tag_type; From 7864cd4d40a8a9e398b702e4ddcd7bd75496114f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 14:51:17 -0400 Subject: [PATCH 18/61] Add bessel_k0 CUDA and NVRTC testing --- test/cuda_jamfile | 2 + test/nvrtc_jamfile | 2 + test/test_bessel_k0_double.cu | 100 ++++++++++++++ test/test_bessel_k0_float.cu | 100 ++++++++++++++ test/test_bessel_k0_nvrtc_double.cpp | 190 +++++++++++++++++++++++++++ test/test_bessel_k0_nvrtc_float.cpp | 190 +++++++++++++++++++++++++++ 6 files changed, 584 insertions(+) create mode 100644 test/test_bessel_k0_double.cu create mode 100644 test/test_bessel_k0_float.cu create mode 100644 test/test_bessel_k0_nvrtc_double.cpp create mode 100644 test/test_bessel_k0_nvrtc_float.cpp diff --git a/test/cuda_jamfile b/test/cuda_jamfile index aca357646..3fb8f57f7 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -103,6 +103,8 @@ run test_bessel_j0_double.cu ; run test_bessel_j0_float.cu ; run test_bessel_j1_double.cu ; run test_bessel_j1_float.cu ; +run test_bessel_k0_double.cu ; +run test_bessel_k0_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index fb1a43b12..5ae4694b4 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -99,6 +99,8 @@ run test_bessel_j0_nvrtc_double.cpp ; run test_bessel_j0_nvrtc_float.cpp ; run test_bessel_j1_nvrtc_double.cpp ; run test_bessel_j1_nvrtc_float.cpp ; +run test_bessel_k0_nvrtc_double.cpp ; +run test_bessel_k0_nvrtc_float.cpp ; run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_bessel_k0_double.cu b/test/test_bessel_k0_double.cu new file mode 100644 index 000000000..26d0e2bff --- /dev/null +++ b/test/test_bessel_k0_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <boost/math/special_functions/detail/bessel_k0.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k0(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::detail::bessel_k0(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_k0_float.cu b/test/test_bessel_k0_float.cu
new file mode 100644
index 000000000..ffe59c25b
--- /dev/null
+++ b/test/test_bessel_k0_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <boost/math/special_functions/detail/bessel_k0.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k0(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::detail::bessel_k0(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_k0_nvrtc_double.cpp b/test/test_bessel_k0_nvrtc_double.cpp
new file mode 100644
index 000000000..d41221212
--- /dev/null
+++ b/test/test_bessel_k0_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_k0(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_k0(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_k0_nvrtc_float.cpp b/test/test_bessel_k0_nvrtc_float.cpp new file mode 100644 index 000000000..389fce21a --- /dev/null +++ b/test/test_bessel_k0_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_k0(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_k0(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From c9dc68e5b8314be6de77eec3499dd1a7a846e988 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 14:59:35 -0400 Subject: [PATCH 19/61] Add GPU support to bessel_k1 --- .../special_functions/detail/bessel_k1.hpp | 126 +++++++++--------- 1 file changed, 66 insertions(+), 60 deletions(-) diff --git a/include/boost/math/special_functions/detail/bessel_k1.hpp b/include/boost/math/special_functions/detail/bessel_k1.hpp index bd37f9021..49846dc8c 100644 --- a/include/boost/math/special_functions/detail/bessel_k1.hpp +++ b/include/boost/math/special_functions/detail/bessel_k1.hpp @@ -13,6 +13,10 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include +#include +#include +#include #include #include #include @@ -44,36 +48,38 @@ namespace boost { namespace math { namespace detail{ template - T bessel_k1(const T&); + BOOST_MATH_GPU_ENABLED T bessel_k1(const T&); template struct bessel_k1_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { bessel_k1(T(0.5)); bessel_k1(T(2)); bessel_k1(T(6)); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { bessel_k1(T(0.5)); bessel_k1(T(6)); } template - static void do_init(const U&) {} - void force_instantiate()const {} + BOOST_MATH_GPU_ENABLED static void do_init(const U&) {} + BOOST_MATH_GPU_ENABLED void force_instantiate()const {} }; - static const init initializer; - static void 
force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -82,14 +88,14 @@ namespace boost { namespace math { namespace detail{ template - inline T bessel_k1_imp(const T&, const std::integral_constant&) + inline BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T&, const boost::math::integral_constant&) { BOOST_MATH_ASSERT(0); return 0; } template - T bessel_k1_imp(const T& x, const std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -98,14 +104,14 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -3.053e-12 // Maximum Relative Change in Control Points : 4.927e-02 // Max Error found at float precision = Poly : 7.918347e-10 - static const T Y = 8.695471287e-02f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 8.695471287e-02f; + BOOST_MATH_STATIC const T P[] = { -3.621379531e-03f, 7.131781976e-03f, -1.535278300e-05f }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000e+00f, -5.173102701e-02f, @@ -118,7 +124,7 @@ namespace boost { namespace math { namespace detail{ // Maximum Deviation Found: 3.556e-08 // Expected Error Term : -3.541e-08 // Maximum Relative Change in Control Points : 8.203e-02 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { -3.079657469e-01f, -8.537108913e-02f, @@ -134,15 +140,15 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -3.227e-08 // Maximum Relative Change in Control Points : 9.917e-02 // Max Error found at float precision = Poly : 6.084411e-08 - static const T Y = 1.450342178f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.450342178f; + BOOST_MATH_STATIC const T P[] = { -1.970280088e-01f, 2.188747807e-02f, 7.270394756e-01f, 2.490678196e-01f }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000e+00f, 2.274292882e+00f, @@ -160,7 +166,7 @@ namespace boost { namespace math { namespace detail{ } template - T bessel_k1_imp(const T& x, const std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -169,15 +175,15 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : 1.921e-17 // Maximum Relative Change in Control Points : 5.287e-03 // Max Error found at double precision = Poly : 2.004747e-17 - static const T Y = 8.69547128677368164e-02f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 8.69547128677368164e-02f; + BOOST_MATH_STATIC const T P[] = { -3.62137953440350228e-03, 7.11842087490330300e-03, 1.00302560256614306e-05, 1.77231085381040811e-06 }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.00000000000000000e+00, -4.80414794429043831e-02, @@ -193,14 +199,14 @@ namespace boost { namespace math { namespace detail{ // Maximum Relative Change in Control Points : 3.103e-04 // Max Error found at double precision = Poly : 1.246698e-16 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { -3.07965757829206184e-01, -7.80929703673074907e-02, -2.70619343754051620e-03, -2.49549522229072008e-05 }; - static const T Q2[] = + BOOST_MATH_STATIC const T Q2[] = { 1.00000000000000000e+00, -2.36316836412163098e-02, @@ -217,8 +223,8 @@ namespace boost { namespace math { namespace detail{ // Maximum Relative Change in Control Points : 2.786e-01 // Max Error found at 
double precision = Poly : 1.258798e-16 - static const T Y = 1.45034217834472656f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.45034217834472656f; + BOOST_MATH_STATIC const T P[] = { -1.97028041029226295e-01, -2.32408961548087617e+00, @@ -230,7 +236,7 @@ namespace boost { namespace math { namespace detail{ 6.62582288933739787e+00, 3.08851840645286691e-01 }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.00000000000000000e+00, 1.41811409298826118e+01, @@ -253,7 +259,7 @@ namespace boost { namespace math { namespace detail{ } template - T bessel_k1_imp(const T& x, const std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -262,8 +268,8 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -5.548e-23 // Maximum Relative Change in Control Points : 2.002e-03 // Max Error found at float80 precision = Poly : 9.352785e-22 - static const T Y = 8.695471286773681640625e-02f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 8.695471286773681640625e-02f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -3.621379534403483072861e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 7.102135866103952705932e-03), @@ -271,7 +277,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 64, 2.537484002571894870830e-06), BOOST_MATH_BIG_CONSTANT(T, 64, 6.603228256820000135990e-09) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -4.354457194045068370363e-02), @@ -287,7 +293,7 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : 1.995e-23 // Maximum Relative Change in Control Points : 8.174e-04 // Max Error found at float80 precision = Poly : 4.137325e-20 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -3.079657578292062244054e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -7.963049154965966503231e-02), @@ -295,7 +301,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 64, -4.023052834702215699504e-05), BOOST_MATH_BIG_CONSTANT(T, 64, -1.719459155018493821839e-07) }; - static const T Q2[] = + BOOST_MATH_STATIC const T Q2[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -1.863917670410152669768e-02), @@ -312,8 +318,8 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -3.302e-21 // Maximum Relative Change in Control Points : 3.432e-01 // Max Error found at float80 precision = Poly : 1.083755e-19 - static const T Y = 1.450342178344726562500e+00f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.450342178344726562500e+00f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -1.970280410292263112917e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -4.058564803062959169322e+00), @@ -328,7 +334,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 64, 4.319614662598089438939e+00), BOOST_MATH_BIG_CONSTANT(T, 64, 3.710715864316521856193e-02) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, 2.298433045824439052398e+01), @@ -353,7 +359,7 @@ namespace boost { namespace math { namespace detail{ } template - T bessel_k1_imp(const T& x, const std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const 
boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -362,8 +368,8 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -7.119e-35 // Maximum Relative Change in Control Points : 1.207e-03 // Max Error found at float128 precision = Poly : 7.143688e-35 - static const T Y = 8.695471286773681640625000000000000000e-02f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 8.695471286773681640625000000000000000e-02f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -3.621379534403483072916666666666595475e-03), BOOST_MATH_BIG_CONSTANT(T, 113, 7.074117676930975433219826471336547627e-03), @@ -373,7 +379,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 113, 2.347140307321161346703214099534250263e-10), BOOST_MATH_BIG_CONSTANT(T, 113, 5.569608494081482873946791086435679661e-13) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, -3.580768910152105375615558920428350204e-02), @@ -391,7 +397,7 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : 4.473e-37 // Maximum Relative Change in Control Points : 8.550e-04 // Max Error found at float128 precision = Poly : 8.167701e-35 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -3.079657578292062244053600156878870690e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -8.133183745732467770755578848987414875e-02), @@ -401,7 +407,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 113, -1.632502325880313239698965376754406011e-09), BOOST_MATH_BIG_CONSTANT(T, 113, -2.311973065898784812266544485665624227e-12) }; - static const T Q2[] = + BOOST_MATH_STATIC const T Q2[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, -1.311471216733781016657962995723287450e-02), @@ -418,8 +424,8 @@ namespace boost { namespace math { namespace detail{ { // Max error in interpolated form: 5.307e-37 // Max Error found at float128 precision = Poly: 7.087862e-35 - static const T Y = 1.5023040771484375f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.5023040771484375f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -2.489899398329369710528254347931380044e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -6.819080211203854781858815596508456873e+00), @@ -438,7 +444,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 113, 1.039705646510167437971862966128055524e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 1.008418100718254816100425022904039530e-02) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 2.927456835239137986889227412815459529e+01), @@ -465,8 +471,8 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -6.565e-40 // Maximum Relative Change in Control Points : 1.880e-01 // Max Error found at float128 precision = Poly : 2.943572e-35 - static const T Y = 1.308816909790039062500000000000000000f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.308816909790039062500000000000000000f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -5.550277247453881129211735759447737350e-02), BOOST_MATH_BIG_CONSTANT(T, 113, -3.485883080219574328217554864956175929e+00), @@ -486,7 +492,7 @@ namespace boost { namespace math 
{ namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 113, 8.981057433937398731355768088809437625e+05), BOOST_MATH_BIG_CONSTANT(T, 113, 2.519440069856232098711793483639792952e+04) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 7.127348248283623146544565916604103560e+01), @@ -517,33 +523,33 @@ namespace boost { namespace math { namespace detail{ } template - T bessel_k1_imp(const T& x, const std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant&) { if(boost::math::tools::digits() <= 24) - return bessel_k1_imp(x, std::integral_constant()); + return bessel_k1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 53) - return bessel_k1_imp(x, std::integral_constant()); + return bessel_k1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 64) - return bessel_k1_imp(x, std::integral_constant()); + return bessel_k1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 113) - return bessel_k1_imp(x, std::integral_constant()); + return bessel_k1_imp(x, boost::math::integral_constant()); BOOST_MATH_ASSERT(0); return 0; } - template - inline T bessel_k1(const T& x) + template + inline BOOST_MATH_GPU_ENABLED T bessel_k1(const T& x) { - typedef std::integral_constant::digits == 0) || (std::numeric_limits::radix != 2)) ? + typedef boost::math::integral_constant::digits == 0) || (boost::math::numeric_limits::radix != 2)) ? 0 : - std::numeric_limits::digits <= 24 ? + boost::math::numeric_limits::digits <= 24 ? 24 : - std::numeric_limits::digits <= 53 ? + boost::math::numeric_limits::digits <= 53 ? 53 : - std::numeric_limits::digits <= 64 ? + boost::math::numeric_limits::digits <= 64 ? 64 : - std::numeric_limits::digits <= 113 ? + boost::math::numeric_limits::digits <= 113 ? 
113 : -1 > tag_type; From c48ec4508fdbdbf9659f1f3a79ea71d25831e048 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 14:59:55 -0400 Subject: [PATCH 20/61] Add bessel_k1 CUDA and NVRTC testing --- test/cuda_jamfile | 2 + test/nvrtc_jamfile | 2 + test/test_bessel_k1_double.cu | 100 ++++++++++++++ test/test_bessel_k1_float.cu | 100 ++++++++++++++ test/test_bessel_k1_nvrtc_double.cpp | 190 +++++++++++++++++++++++++++ test/test_bessel_k1_nvrtc_float.cpp | 190 +++++++++++++++++++++++++++ 6 files changed, 584 insertions(+) create mode 100644 test/test_bessel_k1_double.cu create mode 100644 test/test_bessel_k1_float.cu create mode 100644 test/test_bessel_k1_nvrtc_double.cpp create mode 100644 test/test_bessel_k1_nvrtc_float.cpp diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 3fb8f57f7..28d5ac163 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -105,6 +105,8 @@ run test_bessel_j1_double.cu ; run test_bessel_j1_float.cu ; run test_bessel_k0_double.cu ; run test_bessel_k0_float.cu ; +run test_bessel_k1_double.cu ; +run test_bessel_k1_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 5ae4694b4..ab5b8a7bd 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -101,6 +101,8 @@ run test_bessel_j1_nvrtc_double.cpp ; run test_bessel_j1_nvrtc_float.cpp ; run test_bessel_k0_nvrtc_double.cpp ; run test_bessel_k0_nvrtc_float.cpp ; +run test_bessel_k1_nvrtc_double.cpp ; +run test_bessel_k1_nvrtc_float.cpp ; run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_bessel_k1_double.cu b/test/test_bessel_k1_double.cu new file mode 100644 index 000000000..ed1b353d9 --- /dev/null +++ b/test/test_bessel_k1_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::detail::bessel_k1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
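
A note for readers adapting the test above: the pattern does not depend on the test-local helpers. The following standalone kernel is a minimal sketch of the same computation, assuming the patched Boost.Math headers are on the include path, and using plain cudaMallocManaged in place of cuda_managed_ptr; timing and full error checking are omitted for brevity, and it is not part of the patch series.

#include <boost/math/special_functions/detail/bessel_k1.hpp>
#include <cuda_runtime.h>
#include <cstdio>

__global__ void k1_kernel(const double *in, double *out, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        out[i] = boost::math::detail::bessel_k1(in[i]);
    }
}

int main()
{
    const int n = 1024;
    double *in = nullptr;
    double *out = nullptr;
    cudaMallocManaged(&in, n * sizeof(double));   // unified memory, visible to host and device
    cudaMallocManaged(&out, n * sizeof(double));
    for (int i = 0; i < n; ++i)
    {
        in[i] = (i + 1) / static_cast<double>(n); // K1 diverges at 0, so keep x > 0
    }
    k1_kernel<<<(n + 255) / 256, 256>>>(in, out, n);
    cudaDeviceSynchronize();
    std::printf("K1(%g) = %g\n", in[0], out[0]);
    cudaFree(in);
    cudaFree(out);
    return 0;
}
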
diff --git a/test/test_bessel_k1_float.cu b/test/test_bessel_k1_float.cu
new file mode 100644
index 000000000..65fd802f2
--- /dev/null
+++ b/test/test_bessel_k1_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::detail::bessel_k1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_k1_nvrtc_double.cpp b/test/test_bessel_k1_nvrtc_double.cpp
new file mode 100644
index 000000000..1e0f1e7f4
--- /dev/null
+++ b/test/test_bessel_k1_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_k1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_k1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_k1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_k1_nvrtc_float.cpp b/test/test_bessel_k1_nvrtc_float.cpp new file mode 100644 index 000000000..1422a5886 --- /dev/null +++ b/test/test_bessel_k1_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_k1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_k1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_k1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From e4d9a040451c5a01d67867af022ebd7e41ce8c6c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 16:10:51 -0400 Subject: [PATCH 21/61] Add GPU support to bessel_kn --- include/boost/math/special_functions/detail/bessel_kn.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/boost/math/special_functions/detail/bessel_kn.hpp b/include/boost/math/special_functions/detail/bessel_kn.hpp index d0ddcd0db..41becc8aa 100644 --- a/include/boost/math/special_functions/detail/bessel_kn.hpp +++ b/include/boost/math/special_functions/detail/bessel_kn.hpp @@ -10,8 +10,12 @@ #pragma once #endif +#include +#include +#include #include #include +#include #include // Modified Bessel function of the second kind of integer order @@ -20,14 +24,14 @@ namespace boost { namespace math { namespace detail{ template -T bessel_kn(int n, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T bessel_kn(int n, T x, const Policy& pol) { BOOST_MATH_STD_USING T value, current, prev; using namespace boost::math::tools; - static const char* function = "boost::math::bessel_kn<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::bessel_kn<%1%>(%1%,%1%)"; if (x < 0) { From f0385658488f2e41a6390037912eb09b7332c2e3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 16:12:31 -0400 Subject: [PATCH 22/61] Add bessel_kn CUDA and NVRTC testing --- test/cuda_jamfile | 2 + test/nvrtc_jamfile | 2 + test/test_bessel_kn_double.cu | 105 +++++++++++++++ test/test_bessel_kn_float.cu | 105 +++++++++++++++ test/test_bessel_kn_nvrtc_double.cpp | 192 +++++++++++++++++++++++++++ test/test_bessel_kn_nvrtc_float.cpp | 192 
+++++++++++++++++++++++++++ 6 files changed, 598 insertions(+)
 create mode 100644 test/test_bessel_kn_double.cu
 create mode 100644 test/test_bessel_kn_float.cu
 create mode 100644 test/test_bessel_kn_nvrtc_double.cpp
 create mode 100644 test/test_bessel_kn_nvrtc_float.cpp

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 28d5ac163..98e6a49a6 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -107,6 +107,8 @@ run test_bessel_k0_double.cu ;
run test_bessel_k0_float.cu ;
run test_bessel_k1_double.cu ;
run test_bessel_k1_float.cu ;
+run test_bessel_kn_double.cu ;
+run test_bessel_kn_float.cu ;
run test_cbrt_double.cu ;
run test_cbrt_float.cu ;

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index ab5b8a7bd..07a89b2b6 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -103,6 +103,8 @@ run test_bessel_k0_nvrtc_double.cpp ;
run test_bessel_k0_nvrtc_float.cpp ;
run test_bessel_k1_nvrtc_double.cpp ;
run test_bessel_k1_nvrtc_float.cpp ;
+run test_bessel_kn_nvrtc_double.cpp ;
+run test_bessel_kn_nvrtc_float.cpp ;
run test_cbrt_nvrtc_double.cpp ;
run test_cbrt_nvrtc_float.cpp ;

diff --git a/test/test_bessel_kn_double.cu b/test/test_bessel_kn_double.cu
new file mode 100644
index 000000000..d15ba7304
--- /dev/null
+++ b/test/test_bessel_kn_double.cu
@@ -0,0 +1,105 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    boost::math::policies::policy<> pol;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_kn(2, in[i], pol);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.reserve(numElements);
+    boost::math::policies::policy<> pol;
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::detail::bessel_kn(2, input_vector[i], pol));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
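
Unlike the k0/k1 kernels, bessel_kn takes an integer order and an explicit policy object, which is why the kernel above constructs a default policies::policy<>. On the host the same values are reachable through the public API: cyl_bessel_k called with an integer order forwards to the bessel_kn backend, so the two results should agree to within a few epsilon. The snippet below is an illustrative host-side sketch of that equivalence, not part of the patch series.

#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/policies/policy.hpp>
#include <cmath>
#include <cstdio>

int main()
{
    boost::math::policies::policy<> pol;
    const double x = 3.25;
    // bessel_kn is the integer-order backend exercised by the kernels above;
    // cyl_bessel_k is the public front end that dispatches to it.
    const double backend  = boost::math::detail::bessel_kn(2, x, pol);
    const double frontend = boost::math::cyl_bessel_k(2, x);
    std::printf("backend = %.17g, frontend = %.17g, diff = %g\n",
                backend, frontend, std::fabs(backend - frontend));
    return 0;
}
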
diff --git a/test/test_bessel_kn_float.cu b/test/test_bessel_kn_float.cu
new file mode 100644
index 000000000..d15ba7304
--- /dev/null
+++ b/test/test_bessel_kn_float.cu
@@ -0,0 +1,105 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    boost::math::policies::policy<> pol;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_kn(2, in[i], pol);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!"
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_bessel_kn_nvrtc_double.cpp b/test/test_bessel_kn_nvrtc_double.cpp new file mode 100644 index 000000000..3b581f77c --- /dev/null +++ b/test/test_bessel_kn_nvrtc_double.cpp @@ -0,0 +1,192 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_kn_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_kn(2, in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_kn_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_kn_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), 
"Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_kn_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_kn(2, h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_kn_nvrtc_float.cpp b/test/test_bessel_kn_nvrtc_float.cpp new file mode 100644 index 000000000..dcc987a70 --- /dev/null +++ b/test/test_bessel_kn_nvrtc_float.cpp @@ -0,0 +1,192 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_kn_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_kn(2, in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_kn_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_kn_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_kn_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + 
h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_kn(2, h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From cad503a6d51a9c5e1326e01e58abc0d9ac86fea8 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 16:15:39 -0400 Subject: [PATCH 23/61] Add SYCL testing of complete bessel_k --- test/sycl_jamfile | 1 + test/test_bessel_k.cpp | 5 +++++ test/test_bessel_k.hpp | 3 +++ 3 files changed, 9 insertions(+) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 3075a8971..686f81dd6 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -27,6 +27,7 @@ run pow_test.cpp ; run test_beta_simple.cpp ; run test_bessel_i.cpp ; run test_bessel_j.cpp ; +run test_bessel_k.cpp ; run test_cbrt.cpp ; run test_sign.cpp ; run test_round.cpp ; diff --git a/test/test_bessel_k.cpp b/test/test_bessel_k.cpp index f0975b46d..d4ab7721f 100644 --- a/test/test_bessel_k.cpp +++ b/test/test_bessel_k.cpp @@ -5,7 +5,12 @@ // Boost Software License, Version 1.0. 
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

+#ifndef SYCL_LANGUAGE_VERSION
#include
+#else
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+#include
+#endif

#ifdef _MSC_VER
# pragma warning(disable : 4756) // overflow in constant arithmetic

diff --git a/test/test_bessel_k.hpp b/test/test_bessel_k.hpp
index 22df3218f..6a2a8179d 100644
--- a/test/test_bessel_k.hpp
+++ b/test/test_bessel_k.hpp
@@ -9,6 +9,7 @@
#include
#include
#include
+#include
#include
#include
#include "functor.hpp"
@@ -175,6 +176,7 @@ void test_bessel(T, const char* name)
   //
   // Extra test coverage:
   //
+  #ifndef SYCL_LANGUAGE_VERSION // SYCL doesn't throw
   BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(2), T(-1)), std::domain_error);
   BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(2.2), T(-1)), std::domain_error);
   BOOST_IF_CONSTEXPR(std::numeric_limits::has_infinity)
@@ -194,6 +196,7 @@ void test_bessel(T, const char* name)
   BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(-1.25), T(0)), std::domain_error);
   BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(-1), T(0)), std::domain_error);
   BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(1), T(0)), std::domain_error);
+  #endif
}
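
The #ifndef SYCL_LANGUAGE_VERSION guards above reflect the fact that SYCL device code cannot propagate C++ exceptions, so the std::domain_error checks can only run on the host. When throw-free behaviour is wanted everywhere, the usual Boost.Math idiom is to select a non-throwing error-handling policy instead; the host-side sketch below illustrates the idea and is not part of the patch series.

#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/policies/policy.hpp>
#include <cmath>
#include <cstdio>

int main()
{
    using boost::math::policies::policy;
    using boost::math::policies::domain_error;
    using boost::math::policies::errno_on_error;

    typedef policy<domain_error<errno_on_error> > quiet_policy;

    // Negative x is outside the domain of cyl_bessel_k; with this policy the
    // call sets errno and returns NaN instead of throwing std::domain_error.
    const double v = boost::math::cyl_bessel_k(2.0, -1.0, quiet_policy());
    std::printf("domain error produced NaN: %s\n", std::isnan(v) ? "yes" : "no");
    return 0;
}
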
From bf543f98d8a65f5e1104e3383232e60582c209d8 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 16 Aug 2024 16:48:17 -0400
Subject: [PATCH 24/61] Make newton-raphson GPU compatible

---
 include/boost/math/tools/roots.hpp | 93 +++++++++++++++++++-----------
 1 file changed, 60 insertions(+), 33 deletions(-)

diff --git a/include/boost/math/tools/roots.hpp b/include/boost/math/tools/roots.hpp
index 8f36aa22d..d1614d7eb 100644
--- a/include/boost/math/tools/roots.hpp
+++ b/include/boost/math/tools/roots.hpp
@@ -1,4 +1,5 @@
// (C) Copyright John Maddock 2006.
+// (C) Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -11,23 +12,19 @@
#endif

#include
-
-#ifndef BOOST_MATH_HAS_NVRTC // Disabled for now
-
#include // test for multiprecision types in complex Newton
-
-#include
-#include
-#include
-#include
-
-#include
-
+#include
+#include
+#include
+#include
#include
#include
-#include
#include

+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+#include
+#endif
+
namespace boost {
namespace math {
namespace tools {

@@ -37,11 +34,11 @@ namespace detail {

namespace dummy {

template
-   typename T::value_type get(const T&) BOOST_MATH_NOEXCEPT(T);
+   BOOST_MATH_GPU_ENABLED typename T::value_type get(const T&) BOOST_MATH_NOEXCEPT(T);
}

template
-void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T)
+BOOST_MATH_GPU_ENABLED void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T)
{
   using dummy::get;
   // Use ADL to find the right overload for get:
@@ -49,7 +46,7 @@ void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T)
   b = get<1>(t);
}
template
-void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T)
+BOOST_MATH_GPU_ENABLED void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T)
{
   using dummy::get;
   // Use ADL to find the right overload for get:
@@ -59,7 +56,7 @@
}

template
-inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T)
+BOOST_MATH_GPU_ENABLED inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T)
{
   using dummy::get;
   // Rely on ADL to find the correct overload of get:
@@ -67,26 +64,30 @@
}

template
-inline void unpack_tuple(const std::pair& p, V& a, V& b) BOOST_MATH_NOEXCEPT(T)
+BOOST_MATH_GPU_ENABLED inline void unpack_tuple(const boost::math::pair& p, V& a, V& b) BOOST_MATH_NOEXCEPT(T)
{
   a = p.first;
   b = p.second;
}
template
-inline void unpack_0(const std::pair& p, V& a) BOOST_MATH_NOEXCEPT(T)
+BOOST_MATH_GPU_ENABLED inline void unpack_0(const boost::math::pair& p, V& a) BOOST_MATH_NOEXCEPT(T)
{
   a = p.first;
}

template
-void handle_zero_derivative(F f,
+BOOST_MATH_GPU_ENABLED void handle_zero_derivative(F f,
   T& last_f0,
   const T& f0,
   T& delta,
   T& result,
   T& guess,
   const T& min,
-   const T& max) noexcept(BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval())))
+   const T& max) noexcept(BOOST_MATH_IS_FLOAT(T)
+   #ifndef BOOST_MATH_HAS_GPU_SUPPORT
+   && noexcept(std::declval()(std::declval()))
+   #endif
+   )
{
   if (last_f0 == 0)
   {
@@ -132,25 +133,29 @@
} // namespace

template
-std::pair bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval())))
+boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+&& noexcept(std::declval()(std::declval()))
+#endif
+)
{
   T fmin = f(min);
   T fmax = f(max);
   if (fmin == 0)
   {
      max_iter = 2;
-      return std::make_pair(min, min);
+      return boost::math::make_pair(min, min);
   }
   if (fmax == 0)
   {
      max_iter = 2;
-      return std::make_pair(max, max);
+      return boost::math::make_pair(max, max);
   }
   //
   // Error checking:
   //
-   static const char* function = "boost::math::tools::bisect<%1%>";
+   constexpr auto function = 
"boost::math::tools::bisect<%1%>"; if (min >= max) { return boost::math::detail::pair_from_single(policies::raise_evaluation_error(function, @@ -200,29 +205,41 @@ std::pair bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter, con std::cout << "Bisection required " << max_iter << " iterations.\n"; #endif - return std::make_pair(min, max); + return boost::math::make_pair(min, max); } template -inline std::pair bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) +inline boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()(std::declval())) +#endif +) { return bisect(f, min, max, tol, max_iter, policies::policy<>()); } template -inline std::pair bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) +inline boost::math::pair bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()(std::declval())) +#endif +) { - std::uintmax_t m = (std::numeric_limits::max)(); + boost::math::uintmax_t m = (boost::math::numeric_limits::max)(); return bisect(f, min, max, tol, m, policies::policy<>()); } template -T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) +T newton_raphson_iterate(F f, T guess, T min, T max, int digits, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()(std::declval())) +#endif +) { BOOST_MATH_STD_USING - static const char* function = "boost::math::tools::newton_raphson_iterate<%1%>"; + constexpr auto function = "boost::math::tools::newton_raphson_iterate<%1%>"; if (min > max) { return policies::raise_evaluation_error(function, "Range arguments in wrong order in boost::math::tools::newton_raphson_iterate(first arg=%1%)", min, boost::math::policies::policy<>()); @@ -249,7 +266,7 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& T max_range_f = 0; T min_range_f = 0; - std::uintmax_t count(max_iter); + boost::math::uintmax_t count(max_iter); #ifdef BOOST_MATH_INSTRUMENT std::cout << "Newton_raphson_iterate, guess = " << guess << ", min = " << min << ", max = " << max @@ -336,12 +353,22 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& } template -inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) +inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()(std::declval())) +#endif +) { - std::uintmax_t m = (std::numeric_limits::max)(); + boost::math::uintmax_t m = (boost::math::numeric_limits::max)(); return newton_raphson_iterate(f, guess, min, max, digits, m); } +// TODO(mborland): Disabled for now +// Recursion 
needs to be removed, but there is no demand at this time +#ifdef BOOST_MATH_HAS_NVRTC +}}} // Namespaces +#else + namespace detail { struct halley_step From 6a326674260d62da014bafb0476170d0383d6040 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 16:48:42 -0400 Subject: [PATCH 25/61] Make the completed bessel functions GPU compatible --- .../boost/math/special_functions/bessel.hpp | 214 +++++++++++------- .../special_functions/detail/bessel_yn.hpp | 6 +- .../boost/math/special_functions/math_fwd.hpp | 64 +++--- 3 files changed, 162 insertions(+), 122 deletions(-) diff --git a/include/boost/math/special_functions/bessel.hpp b/include/boost/math/special_functions/bessel.hpp index e9677d3c7..3bba825da 100644 --- a/include/boost/math/special_functions/bessel.hpp +++ b/include/boost/math/special_functions/bessel.hpp @@ -15,8 +15,14 @@ # pragma once #endif -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -31,10 +37,11 @@ #include #include #include -#include -#include -#include -#include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#endif #ifdef _MSC_VER # pragma warning(push) @@ -50,7 +57,7 @@ struct sph_bessel_j_small_z_series_term { typedef T result_type; - sph_bessel_j_small_z_series_term(unsigned v_, T x) + BOOST_MATH_GPU_ENABLED sph_bessel_j_small_z_series_term(unsigned v_, T x) : N(0), v(v_) { BOOST_MATH_STD_USING @@ -79,11 +86,11 @@ struct sph_bessel_j_small_z_series_term }; template -inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names sph_bessel_j_small_z_series_term s(v, x); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter); @@ -92,10 +99,22 @@ inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol) } template -T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol) +BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp_final(T v, T x, const bessel_no_int_tag& t, const Policy& pol) +{ + BOOST_MATH_STD_USING + constexpr auto function = "boost::math::bessel_j<%1%>(%1%,%1%)"; + + T result_J, y; // LCOV_EXCL_LINE + bessel_jy(v, x, &result_J, &y, need_j, pol); + return result_J; +} + +// Dispatch function to avoid recursion +template +BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::bessel_j<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::bessel_j<%1%>(%1%,%1%)"; if(x < 0) { // better have integer v: @@ -105,23 +124,26 @@ T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol) // This branch is hit by multiprecision types only, and is // tested by our real_concept tests, but these are excluded from coverage // due to time constraints.
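// The reflection handled just below is the one place where this dispatch
// layer would otherwise call itself; routing the work through
// cyl_bessel_j_imp_final keeps the call graph recursion-free, which is the
// point of the _final split for device compilers.  A minimal sketch of the
// pattern (hypothetical names, for illustration only):
//
//    template <class T>
//    BOOST_MATH_GPU_ENABLED T impl_final(T x);   // real work, never recurses
//
//    template <class T>
//    BOOST_MATH_GPU_ENABLED T impl(T x)
//    {
//       // reflect the argument once here, then call the non-recursive core
//       return x < 0 ? -impl_final(T(-x)) : impl_final(x);
//    }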
- T r = cyl_bessel_j_imp(v, T(-x), t, pol); + T r = cyl_bessel_j_imp_final(T(v), T(-x), t, pol); if (iround(v, pol) & 1) + { r = -r; + } + return r; // LCOV_EXCL_STOP } else + { return policies::raise_domain_error(function, "Got x = %1%, but we need x >= 0", x, pol); + } } - T result_J, y; // LCOV_EXCL_LINE - bessel_jy(v, x, &result_J, &y, need_j, pol); - return result_J; + return cyl_bessel_j_imp_final(T(v), T(x), t, pol); } template -inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names. int ival = detail::iconv(v, pol); @@ -135,14 +157,14 @@ inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& p } template -inline T cyl_bessel_j_imp(int v, T x, const bessel_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_imp(int v, T x, const bessel_int_tag&, const Policy& pol) { BOOST_MATH_STD_USING return bessel_jn(v, x, pol); } template -inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names if(x < 0) @@ -171,7 +193,7 @@ inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol) } template -T cyl_bessel_i_imp(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T cyl_bessel_i_imp_final(T v, T x, const Policy& pol) { // // This handles all the bessel I functions, note that we don't optimise @@ -180,20 +202,7 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol) // case has better error handling too). // BOOST_MATH_STD_USING - static const char* function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)"; - if(x < 0) - { - // better have integer v: - if(floor(v) == v) - { - T r = cyl_bessel_i_imp(v, T(-x), pol); - if(iround(v, pol) & 1) - r = -r; - return r; - } - else - return policies::raise_domain_error(function, "Got x = %1%, but we need x >= 0", x, pol); - } + constexpr auto function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)"; if(x == 0) { if(v < 0) @@ -210,7 +219,7 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol) } return sqrt(2 / (x * constants::pi())) * sinh(x); } - if((policies::digits() <= 113) && (std::numeric_limits::digits <= 113) && (std::numeric_limits::radix == 2)) + if((policies::digits() <= 113) && (boost::math::numeric_limits::digits <= 113) && (boost::math::numeric_limits::radix == 2)) { if(v == 0) { @@ -228,10 +237,39 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol) return result_I; } +// Additional dispatch function to get the GPU impls happy +template +BOOST_MATH_GPU_ENABLED T cyl_bessel_i_imp(T v, T x, const Policy& pol) +{ + BOOST_MATH_STD_USING + constexpr auto function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)"; + + if(x < 0) + { + // better have integer v: + if(floor(v) == v) + { + T r = cyl_bessel_i_imp_final(T(v), T(-x), pol); + if(iround(v, pol) & 1) + { + r = -r; + } + + return r; + } + else + { + return policies::raise_domain_error(function, "Got x = %1%, but we need x >= 0", x, pol); + } + } + + return cyl_bessel_i_imp_final(T(v), T(x), pol); +} + template -inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Policy& pol) { - static const char* function = "boost::math::cyl_bessel_k<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::cyl_bessel_k<%1%>(%1%,%1%)"; 
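// A note on the change just above, repeated throughout this patch series:
// `static const char* function = ...` becomes `constexpr auto function = ...`
// because function-local statics are problematic (and in places unsupported)
// in device code, while a constexpr local compiles for host and device alike.
// Before/after sketch, not itself part of the diff:
//
//    static const char* function = "boost::math::foo<%1%>";   // host only
//    constexpr auto function = "boost::math::foo<%1%>";       // host + device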
BOOST_MATH_STD_USING if(x < 0) { @@ -248,7 +286,7 @@ inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Poli } template -inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) { BOOST_MATH_STD_USING if((floor(v) == v)) @@ -259,15 +297,15 @@ inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& p } template -inline T cyl_bessel_k_imp(int v, T x, const bessel_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(int v, T x, const bessel_int_tag&, const Policy& pol) { return bessel_kn(v, x, pol); } template -inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol) { - static const char* function = "boost::math::cyl_neumann<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::cyl_neumann<%1%>(%1%,%1%)"; BOOST_MATH_INSTRUMENT_VARIABLE(v); BOOST_MATH_INSTRUMENT_VARIABLE(x); @@ -291,7 +329,7 @@ inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol) } template -inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) { BOOST_MATH_STD_USING @@ -310,16 +348,16 @@ inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& po } template -inline T cyl_neumann_imp(int v, T x, const bessel_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(int v, T x, const bessel_int_tag&, const Policy& pol) { return bessel_yn(v, x, pol); } template -inline T sph_neumann_imp(unsigned v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T sph_neumann_imp(unsigned v, T x, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names - static const char* function = "boost::math::sph_neumann<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::sph_neumann<%1%>(%1%,%1%)"; // // Nothing much to do here but check for errors, and // evaluate the function's definition directly: @@ -340,11 +378,11 @@ inline T sph_neumann_imp(unsigned v, T x, const Policy& pol) } template -inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for floor. - static const char* function = "boost::math::cyl_bessel_j_zero<%1%>(%1%, int)"; + constexpr auto function = "boost::math::cyl_bessel_j_zero<%1%>(%1%, int)"; const T half_epsilon(boost::math::tools::epsilon() / 2U); @@ -395,7 +433,7 @@ inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol) const T guess_root = boost::math::detail::bessel_zero::cyl_bessel_j_zero_detail::initial_guess((order_is_integer ? vv : v), m, pol); // Select the maximum allowed iterations from the policy. - std::uintmax_t number_of_iterations = policies::get_max_root_iterations(); + boost::math::uintmax_t number_of_iterations = policies::get_max_root_iterations(); const T delta_lo = ((guess_root > 0.2F) ? 
T(0.2) : T(guess_root / 2U)); @@ -418,11 +456,11 @@ inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol) } template -inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for floor. - static const char* function = "boost::math::cyl_neumann_zero<%1%>(%1%, int)"; + constexpr auto function = "boost::math::cyl_neumann_zero<%1%>(%1%, int)"; // Handle non-finite order. if (!(boost::math::isfinite)(v) ) @@ -473,7 +511,7 @@ inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol) const T guess_root = boost::math::detail::bessel_zero::cyl_neumann_zero_detail::initial_guess(v, m, pol); // Select the maximum allowed iterations from the policy. - std::uintmax_t number_of_iterations = policies::get_max_root_iterations(); + boost::math::uintmax_t number_of_iterations = policies::get_max_root_iterations(); const T delta_lo = ((guess_root > 0.2F) ? T(0.2) : T(guess_root / 2U)); @@ -498,7 +536,7 @@ inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol) } // namespace detail template -inline typename detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -514,13 +552,13 @@ inline typename detail::bessel_traits::result_type cyl_bessel_j( } template -inline typename detail::bessel_traits >::result_type cyl_bessel_j(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_bessel_j(T1 v, T2 x) { return cyl_bessel_j(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type sph_bessel(unsigned v, T x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type sph_bessel(unsigned v, T x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -535,13 +573,13 @@ inline typename detail::bessel_traits::result_type sph_bessel(unsi } template -inline typename detail::bessel_traits >::result_type sph_bessel(unsigned v, T x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type sph_bessel(unsigned v, T x) { return sph_bessel(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type cyl_bessel_i(T1 v, T2 x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_bessel_i(T1 v, T2 x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -556,13 +594,13 @@ inline typename detail::bessel_traits::result_type cyl_bessel_i( } template -inline typename detail::bessel_traits >::result_type cyl_bessel_i(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_bessel_i(T1 v, T2 x) { return cyl_bessel_i(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type cyl_bessel_k(T1 v, T2 x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_bessel_k(T1 v, T2 x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -578,13 +616,13 @@ inline typename detail::bessel_traits::result_type cyl_bessel_k( } 
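// With the BOOST_MATH_GPU_ENABLED annotations in place, these front ends can
// be called directly from device code, as the CUDA tests added later in this
// series do.  A hedged sketch, assuming a CUDA translation unit; the kernel
// name and the in/out/n parameters are hypothetical:
//
//    __global__ void bessel_k_kernel(const double* in, double* out, int n)
//    {
//       const int i = blockDim.x * blockIdx.x + threadIdx.x;
//       if (i < n)
//       {
//          out[i] = boost::math::cyl_bessel_k(0.5, in[i]);   // order 1/2
//       }
//    }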
template -inline typename detail::bessel_traits >::result_type cyl_bessel_k(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_bessel_k(T1 v, T2 x) { return cyl_bessel_k(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type cyl_neumann(T1 v, T2 x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_neumann(T1 v, T2 x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -600,13 +638,13 @@ inline typename detail::bessel_traits::result_type cyl_neumann(T } template -inline typename detail::bessel_traits >::result_type cyl_neumann(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_neumann(T1 v, T2 x) { return cyl_neumann(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type sph_neumann(unsigned v, T x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type sph_neumann(unsigned v, T x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -621,13 +659,13 @@ inline typename detail::bessel_traits::result_type sph_neumann(uns } template -inline typename detail::bessel_traits >::result_type sph_neumann(unsigned v, T x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type sph_neumann(unsigned v, T x) { return sph_neumann(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type cyl_bessel_j_zero(T v, int m, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_bessel_j_zero(T v, int m, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -639,35 +677,35 @@ inline typename detail::bessel_traits::result_type cyl_bessel_j_ze policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); return policies::checked_narrowing_cast(detail::cyl_bessel_j_zero_imp(v, m, forwarding_policy()), "boost::math::cyl_bessel_j_zero<%1%>(%1%,%1%)"); } template -inline typename detail::bessel_traits >::result_type cyl_bessel_j_zero(T v, int m) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_bessel_j_zero(T v, int m) { - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); return cyl_bessel_j_zero >(v, m, policies::policy<>()); } template -inline OutputIterator cyl_bessel_j_zero(T v, +BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_bessel_j_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it, const Policy& pol) { - static_assert( false == std::numeric_limits::is_specialized - || ( 
true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); for(int i = 0; i < static_cast(number_of_zeros); ++i) @@ -679,7 +717,7 @@ inline OutputIterator cyl_bessel_j_zero(T v, } template -inline OutputIterator cyl_bessel_j_zero(T v, +BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_bessel_j_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it) @@ -688,7 +726,7 @@ inline OutputIterator cyl_bessel_j_zero(T v, } template -inline typename detail::bessel_traits::result_type cyl_neumann_zero(T v, int m, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_neumann_zero(T v, int m, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -700,35 +738,35 @@ inline typename detail::bessel_traits::result_type cyl_neumann_zer policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); return policies::checked_narrowing_cast(detail::cyl_neumann_zero_imp(v, m, forwarding_policy()), "boost::math::cyl_neumann_zero<%1%>(%1%,%1%)"); } template -inline typename detail::bessel_traits >::result_type cyl_neumann_zero(T v, int m) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_neumann_zero(T v, int m) { - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); return cyl_neumann_zero >(v, m, policies::policy<>()); } template -inline OutputIterator cyl_neumann_zero(T v, +BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_neumann_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it, const Policy& pol) { - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); for(int i = 0; i < static_cast(number_of_zeros); ++i) @@ -740,7 +778,7 @@ inline OutputIterator cyl_neumann_zero(T v, } template -inline OutputIterator cyl_neumann_zero(T v, +BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_neumann_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it) diff --git a/include/boost/math/special_functions/detail/bessel_yn.hpp b/include/boost/math/special_functions/detail/bessel_yn.hpp index 73dee0bbb..a45d1761c 100644 --- a/include/boost/math/special_functions/detail/bessel_yn.hpp +++ 
b/include/boost/math/special_functions/detail/bessel_yn.hpp @@ -10,9 +10,11 @@ #pragma once #endif +#include #include #include #include +#include #include // Bessel function of the second kind of integer order @@ -21,14 +23,14 @@ namespace boost { namespace math { namespace detail{ template -T bessel_yn(int n, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T bessel_yn(int n, T x, const Policy& pol) { BOOST_MATH_STD_USING T value, factor, current, prev; using namespace boost::math::tools; - static const char* function = "boost::math::bessel_yn<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::bessel_yn<%1%>(%1%,%1%)"; if ((x == 0) && (n == 0)) { diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 289b27592..230753b49 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -678,98 +678,98 @@ namespace boost // Bessel functions: template - typename detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits::result_type cyl_bessel_j_prime(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_j_prime(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_bessel_j(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_j(T1 v, T2 x); template - typename detail::bessel_traits >::result_type cyl_bessel_j_prime(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_j_prime(T1 v, T2 x); template - typename detail::bessel_traits::result_type sph_bessel(unsigned v, T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type sph_bessel(unsigned v, T x, const Policy& pol); template - typename detail::bessel_traits::result_type sph_bessel_prime(unsigned v, T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type sph_bessel_prime(unsigned v, T x, const Policy& pol); template - typename detail::bessel_traits >::result_type sph_bessel(unsigned v, T x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type sph_bessel(unsigned v, T x); template - typename detail::bessel_traits >::result_type sph_bessel_prime(unsigned v, T x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type sph_bessel_prime(unsigned v, T x); template - typename detail::bessel_traits::result_type cyl_bessel_i(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_i(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits::result_type cyl_bessel_i_prime(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_i_prime(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_bessel_i(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_i(T1 v, T2 x); template - typename detail::bessel_traits >::result_type cyl_bessel_i_prime(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_i_prime(T1 v, T2 x); template - typename detail::bessel_traits::result_type cyl_bessel_k(T1 v, T2 x, const Policy& pol); + 
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_k(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits::result_type cyl_bessel_k_prime(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_k_prime(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_bessel_k(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_k(T1 v, T2 x); template - typename detail::bessel_traits >::result_type cyl_bessel_k_prime(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_k_prime(T1 v, T2 x); template - typename detail::bessel_traits::result_type cyl_neumann(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_neumann(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits::result_type cyl_neumann_prime(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_neumann_prime(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_neumann(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_neumann(T1 v, T2 x); template - typename detail::bessel_traits >::result_type cyl_neumann_prime(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_neumann_prime(T1 v, T2 x); template - typename detail::bessel_traits::result_type sph_neumann(unsigned v, T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type sph_neumann(unsigned v, T x, const Policy& pol); template - typename detail::bessel_traits::result_type sph_neumann_prime(unsigned v, T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type sph_neumann_prime(unsigned v, T x, const Policy& pol); template - typename detail::bessel_traits >::result_type sph_neumann(unsigned v, T x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type sph_neumann(unsigned v, T x); template - typename detail::bessel_traits >::result_type sph_neumann_prime(unsigned v, T x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type sph_neumann_prime(unsigned v, T x); template - typename detail::bessel_traits::result_type cyl_bessel_j_zero(T v, int m, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_j_zero(T v, int m, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_bessel_j_zero(T v, int m); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_j_zero(T v, int m); template - OutputIterator cyl_bessel_j_zero(T v, + BOOST_MATH_GPU_ENABLED OutputIterator cyl_bessel_j_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it); template - OutputIterator cyl_bessel_j_zero(T v, + BOOST_MATH_GPU_ENABLED OutputIterator cyl_bessel_j_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it, const Policy&); template - typename detail::bessel_traits::result_type cyl_neumann_zero(T v, int m, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_neumann_zero(T v, int m, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_neumann_zero(T v, int m); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_neumann_zero(T v, int m); 
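// For reference, BOOST_MATH_GPU_ENABLED is the library's portable annotation
// macro.  Conceptually it behaves like the following sketch (the real
// definition lives in boost/math/tools/config.hpp):
//
//    #ifdef __CUDACC__
//    #  define BOOST_MATH_GPU_ENABLED __host__ __device__
//    #else
//    #  define BOOST_MATH_GPU_ENABLED
//    #endif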
template - OutputIterator cyl_neumann_zero(T v, + BOOST_MATH_GPU_ENABLED OutputIterator cyl_neumann_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it); template - OutputIterator cyl_neumann_zero(T v, + BOOST_MATH_GPU_ENABLED OutputIterator cyl_neumann_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it, From 40c39cac2099e0e53b9052a1cbcf3c037eed0a9b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 16 Aug 2024 16:48:55 -0400 Subject: [PATCH 26/61] Add SYCL bessel y testing --- test/sycl_jamfile | 1 + test/test_bessel_y.cpp | 9 +++++++++ test/test_bessel_y.hpp | 3 +++ 3 files changed, 13 insertions(+) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 686f81dd6..97c48474c 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -28,6 +28,7 @@ run test_beta_simple.cpp ; run test_bessel_i.cpp ; run test_bessel_j.cpp ; run test_bessel_k.cpp ; +run test_bessel_y.cpp ; run test_cbrt.cpp ; run test_sign.cpp ; run test_round.cpp ; diff --git a/test/test_bessel_y.cpp b/test/test_bessel_y.cpp index 83c24b95f..0bbefba55 100644 --- a/test/test_bessel_y.cpp +++ b/test/test_bessel_y.cpp @@ -3,7 +3,12 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#else +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +#include +#endif #include "test_bessel_y.hpp" @@ -234,7 +239,11 @@ void expected_results() ".*", // platform largest_type, // test type(s) ".*(Y[nv]|y).*Random.*", // test data group + #ifdef SYCL_LANGUAGE_VERSION + ".*", 2000, 1000); + #else ".*", 1500, 1000); // test function + #endif // // Fallback for sun has to go after the general cases above: // diff --git a/test/test_bessel_y.hpp b/test/test_bessel_y.hpp index 28361a227..14b0be456 100644 --- a/test/test_bessel_y.hpp +++ b/test/test_bessel_y.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -241,10 +242,12 @@ void test_bessel(T, const char* name) BOOST_CHECK_EQUAL(boost::math::sph_neumann(2, std::numeric_limits::infinity()), T(0)); } + #ifndef BOOST_MATH_NO_EXCEPTIONS BOOST_CHECK_THROW(boost::math::cyl_neumann(T(0), T(-1)), std::domain_error); BOOST_CHECK_THROW(boost::math::cyl_neumann(T(0.2), T(-1)), std::domain_error); BOOST_CHECK_THROW(boost::math::cyl_neumann(T(2), T(0)), std::domain_error); BOOST_CHECK_THROW(boost::math::sph_neumann(2, T(-2)), std::domain_error); + #endif #if LDBL_MAX_EXP > 1024 if (std::numeric_limits::max_exponent > 1024) { From 469a91d3974f20ee72c6d1cd235fd0b78cca0ecc Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 14:45:38 -0400 Subject: [PATCH 27/61] Apply changes for non-empty policy on CUDA --- .../boost/math/special_functions/expm1.hpp | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/include/boost/math/special_functions/expm1.hpp b/include/boost/math/special_functions/expm1.hpp index b59721e93..5e61ca20b 100644 --- a/include/boost/math/special_functions/expm1.hpp +++ b/include/boost/math/special_functions/expm1.hpp @@ -15,9 +15,6 @@ #ifndef BOOST_MATH_HAS_NVRTC -#include -#include -#include #include #include #include @@ -25,6 +22,9 @@ #include #include #include +#include +#include +#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -49,10 +49,10 @@ namespace detail { typedef T result_type; - expm1_series(T x) + BOOST_MATH_GPU_ENABLED expm1_series(T x) : k(0), m_x(x), m_term(1) {} - T operator()() + BOOST_MATH_GPU_ENABLED T 
operator()() { ++k; m_term *= m_x; @@ -60,7 +60,7 @@ namespace detail return m_term; } - int count()const + BOOST_MATH_GPU_ENABLED int count()const { return k; } @@ -78,26 +78,28 @@ struct expm1_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } template - static void do_init(const std::integral_constant&){} - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&){} + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { expm1(T(0.5)); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { expm1(T(0.5)); } - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; - static const init initializer; - static void force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -110,7 +112,7 @@ const typename expm1_initializer::init expm1_initializer |x| > epsilon. // template -T expm1_imp(T x, const std::integral_constant&, const Policy& pol) +T expm1_imp(T x, const boost::math::integral_constant&, const Policy& pol) { BOOST_MATH_STD_USING @@ -132,7 +134,7 @@ T expm1_imp(T x, const std::integral_constant&, const Policy& pol) if(a < tools::epsilon()) return x; detail::expm1_series s(x); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); T result = tools::sum_series(s, policies::get_epsilon(), max_iter); @@ -141,7 +143,7 @@ T expm1_imp(T x, const std::integral_constant&, const Policy& pol) } template -T expm1_imp(T x, const std::integral_constant&, const P& pol) +BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant&, const P& pol) { BOOST_MATH_STD_USING @@ -159,16 +161,16 @@ T expm1_imp(T x, const std::integral_constant&, const P& pol) if(a < tools::epsilon()) return x; - static const float Y = 0.10281276702880859e1f; - static const T n[] = { static_cast(-0.28127670288085937e-1), static_cast(0.51278186299064534e0), static_cast(-0.6310029069350198e-1), static_cast(0.11638457975729296e-1), static_cast(-0.52143390687521003e-3), static_cast(0.21491399776965688e-4) }; - static const T d[] = { 1, static_cast(-0.45442309511354755e0), static_cast(0.90850389570911714e-1), static_cast(-0.10088963629815502e-1), static_cast(0.63003407478692265e-3), static_cast(-0.17976570003654402e-4) }; + BOOST_MATH_STATIC const float Y = 0.10281276702880859e1f; + BOOST_MATH_STATIC const T n[] = { static_cast(-0.28127670288085937e-1), static_cast(0.51278186299064534e0), static_cast(-0.6310029069350198e-1), static_cast(0.11638457975729296e-1), static_cast(-0.52143390687521003e-3), static_cast(0.21491399776965688e-4) }; + BOOST_MATH_STATIC const T d[] = { 1, static_cast(-0.45442309511354755e0), static_cast(0.90850389570911714e-1), static_cast(-0.10088963629815502e-1), static_cast(0.63003407478692265e-3), static_cast(-0.17976570003654402e-4) }; T result = x * Y + x * tools::evaluate_polynomial(n, x) / tools::evaluate_polynomial(d, x); return result; } template -T expm1_imp(T x, const std::integral_constant&, const P& pol) +BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant&, const P& pol) { BOOST_MATH_STD_USING @@ -186,8 +188,8 @@ T expm1_imp(T x, const std::integral_constant&, 
const P& pol) if(a < tools::epsilon()) return x; - static const float Y = 0.10281276702880859375e1f; - static const T n[] = { + BOOST_MATH_STATIC const float Y = 0.10281276702880859375e1f; + BOOST_MATH_STATIC const T n[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.281276702880859375e-1), BOOST_MATH_BIG_CONSTANT(T, 64, 0.512980290285154286358e0), BOOST_MATH_BIG_CONSTANT(T, 64, -0.667758794592881019644e-1), @@ -196,7 +198,7 @@ T expm1_imp(T x, const std::integral_constant&, const P& pol) BOOST_MATH_BIG_CONSTANT(T, 64, 0.447441185192951335042e-4), BOOST_MATH_BIG_CONSTANT(T, 64, -0.714539134024984593011e-6) }; - static const T d[] = { + BOOST_MATH_STATIC const T d[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, -0.461477618025562520389e0), BOOST_MATH_BIG_CONSTANT(T, 64, 0.961237488025708540713e-1), @@ -211,7 +213,7 @@ T expm1_imp(T x, const std::integral_constant&, const P& pol) } template -T expm1_imp(T x, const std::integral_constant&, const P& pol) +BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant&, const P& pol) { BOOST_MATH_STD_USING @@ -263,7 +265,7 @@ T expm1_imp(T x, const std::integral_constant&, const P& pol) } // namespace detail template -inline typename tools::promote_args::type expm1(T x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type expm1(T x, const Policy& /* pol */) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -275,7 +277,7 @@ inline typename tools::promote_args::type expm1(T x, const Policy& /* pol */) policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef std::integral_constant Date: Mon, 19 Aug 2024 15:08:00 -0400 Subject: [PATCH 28/61] Add NVCC cyl_bessel_i testing --- test/cuda_jamfile | 2 + test/test_cyl_bessel_i_double.cu | 104 +++++++++++++++++++++++++++++++ test/test_cyl_bessel_i_float.cu | 104 +++++++++++++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 test/test_cyl_bessel_i_double.cu create mode 100644 test/test_cyl_bessel_i_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 98e6a49a6..cd5a48a18 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -109,6 +109,8 @@ run test_bessel_k1_double.cu ; run test_bessel_k1_float.cu ; run test_bessel_kn_double.cu ; run test_bessel_kn_float.cu ; +run test_cyl_bessel_i_double.cu ; +run test_cyl_bessel_i_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/test_cyl_bessel_i_double.cu b/test/test_cyl_bessel_i_double.cu new file mode 100644 index 000000000..91a3ed8eb --- /dev/null +++ b/test/test_cyl_bessel_i_double.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::cyl_bessel_i(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cyl_bessel_i_float.cu b/test/test_cyl_bessel_i_float.cu new file mode 100644 index 000000000..5aad1be88 --- /dev/null +++ b/test/test_cyl_bessel_i_float.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::cyl_bessel_i(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 60f7d64401aecce5b11b5c9606d0d5545a37621d Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 16:07:08 -0400 Subject: [PATCH 29/61] Add GPU support to sinc --- .../boost/math/special_functions/math_fwd.hpp | 6 ++-- include/boost/math/special_functions/sinc.hpp | 29 +++++++++---------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 230753b49..897cf8280 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -607,10 +607,10 @@ namespace boost // sinus cardinals: template - tools::promote_args_t sinc_pi(T x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t sinc_pi(T x); template - tools::promote_args_t sinc_pi(T x, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t sinc_pi(T x, const Policy&); template tools::promote_args_t sinhc_pi(T x); @@ -1487,7 +1487,7 @@ namespace boost BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t sinhc_pi(T x){ return boost::math::sinhc_pi(x, Policy()); }\ diff --git a/include/boost/math/special_functions/sinc.hpp b/include/boost/math/special_functions/sinc.hpp index ff1b2e966..0c18ac346 100644 --- a/include/boost/math/special_functions/sinc.hpp +++ b/include/boost/math/special_functions/sinc.hpp @@ -17,13 +17,13 @@ #include #include +#include #include -#include #include -#include -#include -#include -#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#endif // These are the "Sinus Cardinal" functions. // This is the "Sinus Cardinal" of index Pi.
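// Near zero the direct quotient sin(x)/x degenerates (0/0 at the origin), so
// the implementation below switches to the Taylor form
//
//    sin(x)/x = 1 - x^2/6 + x^4/120 - ...
//
// keeping only the first correction term.  Below the cutoff
// |x| < 3.3 * epsilon^(1/4) the first neglected term, x^4/120, is already of
// the order of machine epsilon, so 1 - x^2/6 is accurate to working precision.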
template - inline T sinc_pi_imp(const T x) + BOOST_MATH_GPU_ENABLED inline T sinc_pi_imp(const T x) { BOOST_MATH_STD_USING @@ -44,7 +44,7 @@ namespace boost { return 0; } - else if (abs(x) >= 3.3 * tools::forth_root_epsilon()) + else if (abs(x) >= T(3.3) * tools::forth_root_epsilon()) { return(sin(x)/x); } @@ -58,24 +58,23 @@ namespace boost } // namespace detail template - inline typename tools::promote_args::type sinc_pi(T x) + BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type sinc_pi(T x) { typedef typename tools::promote_args::type result_type; return detail::sinc_pi_imp(static_cast(x)); } template - inline typename tools::promote_args::type sinc_pi(T x, const Policy&) + BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type sinc_pi(T x, const Policy&) { typedef typename tools::promote_args::type result_type; return detail::sinc_pi_imp(static_cast(x)); } template class U> - inline U sinc_pi(const U x) + BOOST_MATH_GPU_ENABLED inline U sinc_pi(const U x) { BOOST_MATH_STD_USING - using ::std::numeric_limits; T const taylor_0_bound = tools::epsilon(); T const taylor_2_bound = tools::root_epsilon(); @@ -88,11 +87,11 @@ namespace boost else { // approximation by taylor series in x at 0 up to order 0 -#ifdef __MWERKS__ + #ifdef __MWERKS__ U result = static_cast >(1); -#else + #else U result = U(1); -#endif + #endif if (abs(x) >= taylor_0_bound) { @@ -113,7 +112,7 @@ namespace boost } template class U, class Policy> - inline U sinc_pi(const U x, const Policy&) + BOOST_MATH_GPU_ENABLED inline U sinc_pi(const U x, const Policy&) { return sinc_pi(x); } From 9f43d94982ec5d0965b1bf4151216fb46d2e57ca Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 16:07:23 -0400 Subject: [PATCH 30/61] Add GPU support to series functions --- include/boost/math/tools/config.hpp | 1 + include/boost/math/tools/series.hpp | 70 ++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index e1ba2c344..9a090fdaa 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -798,6 +798,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return #define BOOST_MATH_FORCEINLINE __forceinline__ #define BOOST_MATH_STD_USING #define BOOST_MATH_IF_CONSTEXPR if constexpr +#define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point::value) // This should be defined to nothing but since it is not specifically a math macro // we need to undef before proceeding diff --git a/include/boost/math/tools/series.hpp b/include/boost/math/tools/series.hpp index a4b5cc626..50f2828bb 100644 --- a/include/boost/math/tools/series.hpp +++ b/include/boost/math/tools/series.hpp @@ -10,10 +10,10 @@ #pragma once #endif -#include -#include -#include + #include +#include +#include namespace boost{ namespace math{ namespace tools{ @@ -21,13 +21,17 @@ namespace boost{ namespace math{ namespace tools{ // Simple series summation come first: // template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms, const V& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms, const V& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& 
noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; - std::uintmax_t counter = max_terms; + boost::math::uintmax_t counter = max_terms; result_type result = init_value; result_type next_term; @@ -44,14 +48,22 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { typename Functor::result_type init_value = 0; return sum_series(func, factor, max_terms, init_value); } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, std::uintmax_t& max_terms, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; @@ -60,17 +72,25 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; - std::uintmax_t iters = (std::numeric_limits::max)(); + boost::math::uintmax_t iters = (boost::math::numeric_limits::max)(); result_type init_val = 0; return sum_series(func, bits, iters, init_val); } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; @@ -79,23 +99,31 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING 
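// A note on the #ifndef BOOST_MATH_HAS_GPU_SUPPORT guards now wrapping every
// noexcept(std::declval<Functor>()()) clause in this header: computed
// noexcept specifications built on std::declval are a host-only construct
// here (device builds, NVRTC in particular, do not get the needed standard
// headers), so on device the clause is compiled out and
// BOOST_MATH_IS_FLOAT(T) is the whole condition.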
- std::uintmax_t iters = (std::numeric_limits::max)(); + boost::math::uintmax_t iters = (boost::math::numeric_limits::max)(); return sum_series(func, bits, iters, init_value); } // // Checked summation: // template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms, const V& init_value, V& norm) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms, const V& init_value, V& norm) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; - std::uintmax_t counter = max_terms; + boost::math::uintmax_t counter = max_terms; result_type result = init_value; result_type next_term; @@ -125,7 +153,11 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(F // in any case the result is still much better than a naive summation. // template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING @@ -148,13 +180,17 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Fun } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; - std::uintmax_t counter = max_terms; + boost::math::uintmax_t counter = max_terms; result_type factor = ldexp(result_type(1), bits); result_type result = func(); From 9b7b6903c51fb5ed6f035b5211b2ace1e15f0690 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 16:56:00 -0400 Subject: [PATCH 31/61] Add GPU support to bessel_jy_zero --- .../detail/bessel_jy_zero.hpp | 78 ++++++++++--------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/include/boost/math/special_functions/detail/bessel_jy_zero.hpp b/include/boost/math/special_functions/detail/bessel_jy_zero.hpp index cb1fc48d8..2e3b8c6d8 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_zero.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy_zero.hpp @@ -18,19 +18,26 @@ #ifndef BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_ #define BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_ - #include + #include + #include + #include + #include + #include #include - #include #include #include + #ifndef BOOST_MATH_HAS_NVRTC + #include + #endif + namespace boost { namespace math { namespace detail { namespace bessel_zero { template - T equation_nist_10_21_19(const T& v, const T& a) + BOOST_MATH_GPU_ENABLED T 
equation_nist_10_21_19(const T& v, const T& a) { // Get the initial estimate of the m'th root of Jv or Yv. // This subroutine is used for the order m with m > 1. @@ -57,11 +64,11 @@ class equation_as_9_3_39_and_its_derivative { public: - explicit equation_as_9_3_39_and_its_derivative(const T& zt) : zeta(zt) { } + BOOST_MATH_GPU_ENABLED explicit equation_as_9_3_39_and_its_derivative(const T& zt) : zeta(zt) { } - equation_as_9_3_39_and_its_derivative(const equation_as_9_3_39_and_its_derivative&) = default; + BOOST_MATH_GPU_ENABLED equation_as_9_3_39_and_its_derivative(const equation_as_9_3_39_and_its_derivative&) = default; - boost::math::tuple operator()(const T& z) const + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& z) const { BOOST_MATH_STD_USING // ADL of std names, needed for acos, sqrt. @@ -86,7 +93,7 @@ }; template - static T equation_as_9_5_26(const T& v, const T& ai_bi_root, const Policy& pol) + BOOST_MATH_GPU_ENABLED T equation_as_9_5_26(const T& v, const T& ai_bi_root, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for log, sqrt. @@ -132,9 +139,9 @@ // Select the maximum allowed iterations based on the number // of decimal digits in the numeric type T, being at least 12. - const auto iterations_allowed = static_cast((std::max)(12, my_digits10 * 2)); + const auto iterations_allowed = static_cast(BOOST_MATH_GPU_SAFE_MAX(12, my_digits10 * 2)); - std::uintmax_t iterations_used = iterations_allowed; + boost::math::uintmax_t iterations_used = iterations_allowed; // Calculate the root of z as a function of zeta. const T z = boost::math::tools::newton_raphson_iterate( @@ -142,7 +149,7 @@ z_estimate, range_zmin, range_zmax, - (std::min)(boost::math::tools::digits(), boost::math::tools::digits()), + BOOST_MATH_GPU_SAFE_MIN(boost::math::tools::digits(), boost::math::tools::digits()), iterations_used); static_cast(iterations_used); @@ -168,7 +175,7 @@ namespace cyl_bessel_j_zero_detail { template - T equation_nist_10_21_40_a(const T& v, const Policy& pol) + BOOST_MATH_GPU_ENABLED T equation_nist_10_21_40_a(const T& v, const Policy& pol) { const T v_pow_third(boost::math::cbrt(v, pol)); const T v_pow_minus_two_thirds(T(1) / (v_pow_third * v_pow_third)); @@ -185,13 +192,13 @@ class function_object_jv { public: - function_object_jv(const T& v, + BOOST_MATH_GPU_ENABLED function_object_jv(const T& v, const Policy& pol) : my_v(v), my_pol(pol) { } - function_object_jv(const function_object_jv&) = default; + BOOST_MATH_GPU_ENABLED function_object_jv(const function_object_jv&) = default; - T operator()(const T& x) const + BOOST_MATH_GPU_ENABLED T operator()(const T& x) const { return boost::math::cyl_bessel_j(my_v, x, my_pol); } @@ -206,15 +213,16 @@ class function_object_jv_and_jv_prime { public: - function_object_jv_and_jv_prime(const T& v, - const bool order_is_zero, - const Policy& pol) : my_v(v), + BOOST_MATH_GPU_ENABLED function_object_jv_and_jv_prime( + const T& v, + const bool order_is_zero, + const Policy& pol) : my_v(v), my_order_is_zero(order_is_zero), my_pol(pol) { } function_object_jv_and_jv_prime(const function_object_jv_and_jv_prime&) = default; - boost::math::tuple operator()(const T& x) const + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& x) const { // Obtain Jv(x) and Jv'(x). 
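+        // The derivative comes from the standard recurrence (NIST DLMF 10.6):
+        //   Jv'(x) = J{v-1}(x) - (v / x) * Jv(x),
+        // with the order-zero special case J0'(x) = -J1(x), which is why the
+        // functor tracks whether the order is zero.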
// Chris's original code called the Bessel function implementation layer direct, @@ -246,10 +254,10 @@ const function_object_jv_and_jv_prime& operator=(const function_object_jv_and_jv_prime&) = delete; }; - template bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; } + template BOOST_MATH_GPU_ENABLED bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; } template - T initial_guess(const T& v, const int m, const Policy& pol) + BOOST_MATH_GPU_ENABLED T initial_guess(const T& v, const int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for floor. @@ -325,7 +333,7 @@ } // Perform several steps of bisection iteration to refine the guess. - std::uintmax_t number_of_iterations(12U); + boost::math::uintmax_t number_of_iterations(12U); // Do the bisection iteration. const boost::math::tuple guess_pair = @@ -390,7 +398,7 @@ namespace cyl_neumann_zero_detail { template - T equation_nist_10_21_40_b(const T& v, const Policy& pol) + BOOST_MATH_GPU_ENABLED T equation_nist_10_21_40_b(const T& v, const Policy& pol) { const T v_pow_third(boost::math::cbrt(v, pol)); const T v_pow_minus_two_thirds(T(1) / (v_pow_third * v_pow_third)); @@ -407,13 +415,13 @@ class function_object_yv { public: - function_object_yv(const T& v, - const Policy& pol) : my_v(v), - my_pol(pol) { } + BOOST_MATH_GPU_ENABLED function_object_yv(const T& v, + const Policy& pol) : my_v(v), + my_pol(pol) { } - function_object_yv(const function_object_yv&) = default; + BOOST_MATH_GPU_ENABLED function_object_yv(const function_object_yv&) = default; - T operator()(const T& x) const + BOOST_MATH_GPU_ENABLED T operator()(const T& x) const { return boost::math::cyl_neumann(my_v, x, my_pol); } @@ -428,13 +436,13 @@ class function_object_yv_and_yv_prime { public: - function_object_yv_and_yv_prime(const T& v, - const Policy& pol) : my_v(v), - my_pol(pol) { } + BOOST_MATH_GPU_ENABLED function_object_yv_and_yv_prime(const T& v, + const Policy& pol) : my_v(v), + my_pol(pol) { } - function_object_yv_and_yv_prime(const function_object_yv_and_yv_prime&) = default; + BOOST_MATH_GPU_ENABLED function_object_yv_and_yv_prime(const function_object_yv_and_yv_prime&) = default; - boost::math::tuple operator()(const T& x) const + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& x) const { const T half_epsilon(boost::math::tools::epsilon() / 2U); @@ -469,10 +477,10 @@ const function_object_yv_and_yv_prime& operator=(const function_object_yv_and_yv_prime&) = delete; }; - template bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; } + template BOOST_MATH_GPU_ENABLED bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; } template - T initial_guess(const T& v, const int m, const Policy& pol) + BOOST_MATH_GPU_ENABLED T initial_guess(const T& v, const int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for floor. @@ -560,7 +568,7 @@ } // Perform several steps of bisection iteration to refine the guess. - std::uintmax_t number_of_iterations(12U); + boost::math::uintmax_t number_of_iterations(12U); // Do the bisection iteration. 
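+          // tools::bisect returns the final bracketing interval as a tuple; the
+          // refined estimate is extracted from that pair below. The tolerance
+          // functor above always returns false, so the bracket is simply halved
+          // number_of_iterations (12) times before stopping.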
const boost::math::tuple<T, T> guess_pair =

From 8c4208f5ed6e757bbdc891c0af193e4c777de5de Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 08:40:01 -0400
Subject: [PATCH 32/61] Add array helper type

---
 include/boost/math/tools/array.hpp  | 41 +++++++++++++++++++++++++++++
 include/boost/math/tools/config.hpp |  1 +
 2 files changed, 42 insertions(+)
 create mode 100644 include/boost/math/tools/array.hpp

diff --git a/include/boost/math/tools/array.hpp b/include/boost/math/tools/array.hpp
new file mode 100644
index 000000000..23e666673
--- /dev/null
+++ b/include/boost/math/tools/array.hpp
@@ -0,0 +1,41 @@
+// Copyright (c) 2024 Matt Borland
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// std::array member functions cannot be used in device code on GPU
+// platforms like CUDA since they are missing the __device__ marker.
+// Alias the array type as needed to get correct support.
+
+#ifndef BOOST_MATH_TOOLS_ARRAY_HPP
+#define BOOST_MATH_TOOLS_ARRAY_HPP
+
+#include <boost/math/tools/config.hpp>
+
+#ifdef BOOST_MATH_ENABLE_CUDA
+
+#include <cuda/std/array>
+
+namespace boost {
+namespace math {
+
+using cuda::std::array;
+
+} // namespace math
+} // namespace boost
+
+#else
+
+#include <array>
+
+namespace boost {
+namespace math {
+
+using std::array;
+
+} // namespace math
+} // namespace boost
+
+#endif // BOOST_MATH_ENABLE_CUDA
+
+#endif // BOOST_MATH_TOOLS_ARRAY_HPP
diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp
index 9a090fdaa..82017a62b 100644
--- a/include/boost/math/tools/config.hpp
+++ b/include/boost/math/tools/config.hpp
@@ -676,6 +676,7 @@ namespace boost{ namespace math{
 #include
 #include
 #include
+#include

 # define BOOST_MATH_CUDA_ENABLED __host__ __device__
 # define BOOST_MATH_HAS_GPU_SUPPORT

From 640285ab451db97334f64c95343ffcfedec5d632 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 11:13:59 -0400
Subject: [PATCH 33/61] Make hypot GPU safe

---
 .../boost/math/special_functions/hypot.hpp    | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/boost/math/special_functions/hypot.hpp b/include/boost/math/special_functions/hypot.hpp
index c56c75110..f38e37e87 100644
--- a/include/boost/math/special_functions/hypot.hpp
+++ b/include/boost/math/special_functions/hypot.hpp
@@ -12,20 +12,20 @@

 #include
 #include
+#include
+#include
 #include
 #include
-#include  // for swap
-#include

 namespace boost{ namespace math{ namespace detail{

 template <class T, class Policy>
-T hypot_imp(T x, T y, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T hypot_imp(T x, T y, const Policy& pol)
 {
    //
    // Normalize x and y, so that both are positive and x >= y:
    //
-   using std::fabs; using std::sqrt; // ADL of std names
+   BOOST_MATH_STD_USING

    x = fabs(x);
    y = fabs(y);
@@ -35,16 +35,16 @@ T hypot_imp(T x, T y, const Policy& pol)
 #pragma warning(disable: 4127)
 #endif
    // special case, see C99 Annex F:
-   if(std::numeric_limits<T>::has_infinity
-      && ((x == std::numeric_limits<T>::infinity())
-      || (y == std::numeric_limits<T>::infinity())))
+   if(boost::math::numeric_limits<T>::has_infinity
+      && ((x == boost::math::numeric_limits<T>::infinity())
+      || (y == boost::math::numeric_limits<T>::infinity())))
       return policies::raise_overflow_error<T>("boost::math::hypot<%1%>(%1%,%1%)", nullptr, pol);
 #ifdef _MSC_VER
 #pragma warning(pop)
 #endif

    if(y > x)
-      (std::swap)(x, y);
+      BOOST_MATH_GPU_SAFE_SWAP(x, y);

    if(x * tools::epsilon<T>() >= y)
       return x;

@@ -56,7 +56,7 @@ T hypot_imp(T x, T y, const Policy&
pol) } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hypot(T1 x, T2 y) { typedef typename tools::promote_args::type result_type; @@ -65,7 +65,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hypot(T1 x, T2 y, const Policy& pol) { typedef typename tools::promote_args::type result_type; From d2c1292261a4d931f0ed2c5c8b33d03e721c89d3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 11:43:36 -0400 Subject: [PATCH 34/61] Make bessel_yX GPU capable --- .../special_functions/detail/bessel_y0.hpp | 27 ++++++++++--------- .../special_functions/detail/bessel_y1.hpp | 23 ++++++++-------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/include/boost/math/special_functions/detail/bessel_y0.hpp b/include/boost/math/special_functions/detail/bessel_y0.hpp index 1679820d1..f1aea6acb 100644 --- a/include/boost/math/special_functions/detail/bessel_y0.hpp +++ b/include/boost/math/special_functions/detail/bessel_y0.hpp @@ -12,6 +12,7 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include #include #include #include @@ -36,12 +37,12 @@ namespace boost { namespace math { namespace detail{ template -T bessel_y0(T x, const Policy&); +BOOST_MATH_GPU_ENABLED T bessel_y0(T x, const Policy&); template -T bessel_y0(T x, const Policy&) +BOOST_MATH_GPU_ENABLED T bessel_y0(T x, const Policy&) { - static const T P1[] = { + BOOST_MATH_STATIC const T P1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0723538782003176831e+11)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.3716255451260504098e+09)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.0422274357376619816e+08)), @@ -49,7 +50,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0102532948020907590e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8402381979244993524e+01)), }; - static const T Q1[] = { + BOOST_MATH_STATIC const T Q1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.8873865738997033405e+11)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.1617187777290363573e+09)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5662956624278251596e+07)), @@ -57,7 +58,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6475986689240190091e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T P2[] = { + BOOST_MATH_STATIC const T P2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -2.2213976967566192242e+13)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -5.5107435206722644429e+11)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.3600098638603061642e+10)), @@ -66,7 +67,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4566865832663635920e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7427031242901594547e+01)), }; - static const T Q2[] = { + BOOST_MATH_STATIC const T Q2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.3386146580707264428e+14)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4266824419412347550e+12)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.4015103849971240096e+10)), @@ -75,7 +76,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.3030857612070288823e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T P3[] = { + BOOST_MATH_STATIC const T P3[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.0728726905150210443e+15)), 
static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.7016641869173237784e+14)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2829912364088687306e+11)), @@ -85,7 +86,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1363534169313901632e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.7439661319197499338e+01)), }; - static const T Q3[] = { + BOOST_MATH_STATIC const T Q3[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.4563724628846457519e+17)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.9272425569640309819e+15)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2598377924042897629e+13)), @@ -95,7 +96,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.7903362168128450017e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T PC[] = { + BOOST_MATH_STATIC const T PC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684302e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1345386639580765797e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1170523380864944322e+04)), @@ -103,7 +104,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5376201909008354296e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.8961548424210455236e-01)), }; - static const T QC[] = { + BOOST_MATH_STATIC const T QC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684318e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1370412495510416640e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1215350561880115730e+04)), @@ -111,7 +112,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5711159858080893649e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T PS[] = { + BOOST_MATH_STATIC const T PS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.9226600200800094098e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8591953644342993800e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1183429920482737611e+02)), @@ -119,7 +120,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2441026745835638459e+00)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.8033303048680751817e-03)), }; - static const T QS[] = { + BOOST_MATH_STATIC const T QS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.7105024128512061905e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1951131543434613647e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.2642780169211018836e+03)), @@ -127,7 +128,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 9.0593769594993125859e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.9357696627916752158e-01)), + BOOST_MATH_STATIC const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.9357696627916752158e-01)), x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.9576784193148578684e+00)), x3 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0860510603017726976e+00)), x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.280e+02)), diff --git a/include/boost/math/special_functions/detail/bessel_y1.hpp b/include/boost/math/special_functions/detail/bessel_y1.hpp index 3ac696bb5..0f0dbdf3b 100644 --- a/include/boost/math/special_functions/detail/bessel_y1.hpp +++ b/include/boost/math/special_functions/detail/bessel_y1.hpp @@ -12,6 +12,7 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include #include #include #include @@ -36,12 +37,12 @@ namespace boost 
{ namespace math { namespace detail{ template -T bessel_y1(T x, const Policy&); +BOOST_MATH_GPU_ENABLED T bessel_y1(T x, const Policy&); template -T bessel_y1(T x, const Policy&) +BOOST_MATH_GPU_ENABLED T bessel_y1(T x, const Policy&) { - static const T P1[] = { + BOOST_MATH_STATIC const T P1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.0535726612579544093e+13)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4708611716525426053e+12)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.7595974497819597599e+11)), @@ -50,7 +51,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2157953222280260820e+05)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.1714424660046133456e+02)), }; - static const T Q1[] = { + BOOST_MATH_STATIC const T Q1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.0737873921079286084e+14)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1272286200406461981e+12)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.7800352738690585613e+10)), @@ -59,7 +60,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.2079908168393867438e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T P2[] = { + BOOST_MATH_STATIC const T P2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1514276357909013326e+19)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -5.6808094574724204577e+18)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -2.3638408497043134724e+16)), @@ -70,7 +71,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.9153806858264202986e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2337180442012953128e+03)), }; - static const T Q2[] = { + BOOST_MATH_STATIC const T Q2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.3321844313316185697e+20)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.6968198822857178911e+18)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.0837179548112881950e+16)), @@ -81,7 +82,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.2855164849321609336e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T PC[] = { + BOOST_MATH_STATIC const T PC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278571e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9422465050776411957e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -6.6033732483649391093e+06)), @@ -90,7 +91,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.6116166443246101165e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)), }; - static const T QC[] = { + BOOST_MATH_STATIC const T QC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278568e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9341243899345856590e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -6.5853394797230870728e+06)), @@ -99,7 +100,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4550094401904961825e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T PS[] = { + BOOST_MATH_STATIC const T PS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.3220913409857223519e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.5145160675335701966e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6178836581270835179e+04)), @@ -108,7 +109,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.5265133846636032186e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)), }; - static const T QS[] = { + BOOST_MATH_STATIC const T QS[] = { 
static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0871281941028743574e+05)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8194580422439972989e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4194606696037208929e+06)), @@ -117,7 +118,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.6383677696049909675e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1971413260310170351e+00)), + BOOST_MATH_STATIC const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1971413260310170351e+00)), x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4296810407941351328e+00)), x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.620e+02)), x12 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8288260310170351490e-03)), From b2e35acb406faddb4066bff0ab718f939babdc87 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 11:43:55 -0400 Subject: [PATCH 35/61] Add bessel_y0 and bessel_y1 CUDA testing --- test/cuda_jamfile | 4 ++ test/test_bessel_y0_double.cu | 106 ++++++++++++++++++++++++++++++++++ test/test_bessel_y0_float.cu | 106 ++++++++++++++++++++++++++++++++++ test/test_bessel_y1_double.cu | 106 ++++++++++++++++++++++++++++++++++ test/test_bessel_y1_float.cu | 106 ++++++++++++++++++++++++++++++++++ 5 files changed, 428 insertions(+) create mode 100644 test/test_bessel_y0_double.cu create mode 100644 test/test_bessel_y0_float.cu create mode 100644 test/test_bessel_y1_double.cu create mode 100644 test/test_bessel_y1_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index cd5a48a18..aca93cb31 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -109,6 +109,10 @@ run test_bessel_k1_double.cu ; run test_bessel_k1_float.cu ; run test_bessel_kn_double.cu ; run test_bessel_kn_float.cu ; +run test_bessel_y0_double.cu ; +run test_bessel_y0_float.cu ; +run test_bessel_y1_double.cu ; +run test_bessel_y1_float.cu ; run test_cyl_bessel_i_double.cu ; run test_cyl_bessel_i_float.cu ; diff --git a/test/test_bessel_y0_double.cu b/test/test_bessel_y0_double.cu new file mode 100644 index 000000000..c8deada7d --- /dev/null +++ b/test/test_bessel_y0_double.cu @@ -0,0 +1,106 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    boost::math::policies::policy<> pol;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_y0(in[i], pol);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+
+    boost::math::policies::policy<> pol;
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::detail::bessel_y0(input_vector[i], pol));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
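For reference, the launch configuration used by these tests is plain ceiling division over the element count; a standalone sketch of the same arithmetic (illustration only, not part of the patch):

    #include <cassert>

    int main()
    {
        const int numElements = 50000;
        const int threadsPerBlock = 1024;
        // Smallest grid that covers every element: ceil(50000 / 1024) = 49 blocks.
        const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
        assert(blocksPerGrid == 49);
        assert(blocksPerGrid * threadsPerBlock >= numElements);          // 50176 >= 50000
        assert((blocksPerGrid - 1) * threadsPerBlock < numElements);
        return 0;
    }

The if (i < numElements) guard in the kernel discards the 176 surplus threads in the final block.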
diff --git a/test/test_bessel_y0_float.cu b/test/test_bessel_y0_float.cu
new file mode 100644
index 000000000..c8deada7d
--- /dev/null
+++ b/test/test_bessel_y0_float.cu
@@ -0,0 +1,106 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    boost::math::policies::policy<> pol;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_y0(in[i], pol);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+
+    boost::math::policies::policy<> pol;
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::detail::bessel_y0(input_vector[i], pol));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
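The pass criterion in these tests is an ULP-style distance: boost::math::epsilon_difference reports the relative difference between two values scaled by machine epsilon, so a result is accepted when it lies within 10 epsilon of the serial computation. A small host-only illustration (assumes only standard Boost.Math headers):

    #include <boost/math/special_functions/relative_difference.hpp>
    #include <iostream>
    #include <limits>

    int main()
    {
        const double a = 1.0;
        const double b = 1.0 + 5 * std::numeric_limits<double>::epsilon();
        // Prints approximately 5: the two values differ by five machine epsilons.
        std::cout << boost::math::epsilon_difference(a, b) << '\n';
        return 0;
    }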
diff --git a/test/test_bessel_y1_double.cu b/test/test_bessel_y1_double.cu
new file mode 100644
index 000000000..a5b3051b4
--- /dev/null
+++ b/test/test_bessel_y1_double.cu
@@ -0,0 +1,106 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    boost::math::policies::policy<> pol;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_y1(in[i], pol);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+
+    boost::math::policies::policy<> pol;
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::detail::bessel_y1(input_vector[i], pol));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
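These tests allocate through the repo's cuda_managed_ptr test helper, whose definition lives in test/cuda_managed_ptr.hpp and is not shown in this series. Purely as an illustration of the unified-memory RAII pattern it suggests, a minimal wrapper could look like this (hypothetical type, not the helper's real interface):

    #include <cuda_runtime.h>
    #include <cstddef>

    // Hypothetical sketch: owns a cudaMallocManaged allocation that is visible
    // to both host and device, and frees it automatically on scope exit.
    template <typename T>
    class managed_buffer
    {
        T* ptr_ {nullptr};
    public:
        explicit managed_buffer(std::size_t n) { cudaMallocManaged(&ptr_, n * sizeof(T)); }
        ~managed_buffer() { cudaFree(ptr_); }
        managed_buffer(const managed_buffer&) = delete;
        managed_buffer& operator=(const managed_buffer&) = delete;
        T* get() const { return ptr_; }
        T& operator[](std::size_t i) const { return ptr_[i]; }
    };

Unified memory lets the host-side verification loop read output_vector directly after cudaDeviceSynchronize(), with no explicit cudaMemcpy.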
diff --git a/test/test_bessel_y1_float.cu b/test/test_bessel_y1_float.cu
new file mode 100644
index 000000000..532aaf328
--- /dev/null
+++ b/test/test_bessel_y1_float.cu
@@ -0,0 +1,106 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    boost::math::policies::policy<> pol;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_y1(in[i], pol);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+
+    boost::math::policies::policy<> pol;
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::detail::bessel_y1(input_vector[i], pol));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 5a1d32121af7f412ddc0b59a7df583a5d2174d22 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 11:52:57 -0400 Subject: [PATCH 36/61] Add nvrtc testing of bessel_y0 and bessel_y1 --- test/nvrtc_jamfile | 6 + test/test_bessel_y0_nvrtc_double.cpp | 194 +++++++++++++++++++++++++++ test/test_bessel_y0_nvrtc_float.cpp | 194 +++++++++++++++++++++++++++ test/test_bessel_y1_nvrtc_double.cpp | 194 +++++++++++++++++++++++++++ test/test_bessel_y1_nvrtc_float.cpp | 194 +++++++++++++++++++++++++++ 5 files changed, 782 insertions(+) create mode 100644 test/test_bessel_y0_nvrtc_double.cpp create mode 100644 test/test_bessel_y0_nvrtc_float.cpp create mode 100644 test/test_bessel_y1_nvrtc_double.cpp create mode 100644 test/test_bessel_y1_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 07a89b2b6..1da64aedc 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -105,6 +105,12 @@ run test_bessel_k1_nvrtc_double.cpp ; run test_bessel_k1_nvrtc_float.cpp ; run test_bessel_kn_nvrtc_double.cpp ; run test_bessel_kn_nvrtc_float.cpp ; +run test_bessel_y0_nvrtc_double.cpp ; +run test_bessel_y0_nvrtc_float.cpp ; +run test_bessel_y1_nvrtc_double.cpp ; +run test_bessel_y1_nvrtc_float.cpp ; +# run test_cyl_bessel_i_nvrtc_double.cpp ; +# run test_cyl_bessel_i_nvrtc_float.cpp ; run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_bessel_y0_nvrtc_double.cpp b/test/test_bessel_y0_nvrtc_double.cpp new file mode 100644 index 000000000..8645a0fdd --- /dev/null +++ b/test/test_bessel_y0_nvrtc_double.cpp @@ -0,0 +1,194 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +#include +extern "C" __global__ +void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y0(in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_y0(h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_y0_nvrtc_float.cpp b/test/test_bessel_y0_nvrtc_float.cpp new file mode 100644 index 000000000..75a065bd6 --- /dev/null +++ b/test/test_bessel_y0_nvrtc_float.cpp @@ -0,0 +1,194 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +#include +extern "C" __global__ +void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y0(in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_y0(h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_y1_nvrtc_double.cpp b/test/test_bessel_y1_nvrtc_double.cpp new file mode 100644 index 000000000..383d879eb --- /dev/null +++ b/test/test_bessel_y1_nvrtc_double.cpp @@ -0,0 +1,194 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +#include +extern "C" __global__ +void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y1(in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_y1(h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_y1_nvrtc_float.cpp b/test/test_bessel_y1_nvrtc_float.cpp new file mode 100644 index 000000000..c2c1355e6 --- /dev/null +++ b/test/test_bessel_y1_nvrtc_float.cpp @@ -0,0 +1,194 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +#include +extern "C" __global__ +void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y1(in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_y1(h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 23dc85d0dad758fdae00180481e9b75c27ed5c96 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 11:53:23 -0400 Subject: [PATCH 37/61] Fix macros --- include/boost/math/tools/config.hpp | 4 +++- include/boost/math/tools/roots.hpp | 15 ++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 82017a62b..fda4029d6 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -800,6 +800,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return #define BOOST_MATH_STD_USING #define BOOST_MATH_IF_CONSTEXPR if constexpr #define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point::value) +#define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr // This should be defined to nothing but since it is not specifically a math macro // we need to undef before proceeding @@ -831,7 +832,8 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b; # define BOOST_MATH_INLINE_CONSTEXPR constexpr #endif -#define BOOST_MATH_INSTRUMENT_VARIABLE(x) +#define BOOST_MATH_INSTRUMENT_VARIABLE(x) +#define BOOST_MATH_INSTRUMENT_CODE(x) #endif // NVRTC diff --git a/include/boost/math/tools/roots.hpp b/include/boost/math/tools/roots.hpp index d1614d7eb..b0b0fc246 100644 --- a/include/boost/math/tools/roots.hpp +++ b/include/boost/math/tools/roots.hpp @@ -18,10 +18,11 @@ #include #include #include -#include +#include #include #ifndef BOOST_MATH_HAS_GPU_SUPPORT +#include #include #endif @@ -84,7 +85,7 @@ BOOST_MATH_GPU_ENABLED void handle_zero_derivative(F f, T& guess, const T& min, const T& max) noexcept(BOOST_MATH_IS_FLOAT(T) - #ifndef BOOST_MATH_HAS_GPU_ENABLED + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()(std::declval())) #endif ) @@ -133,7 +134,7 @@ BOOST_MATH_GPU_ENABLED void handle_zero_derivative(F f, } // namespace template -boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T) +BOOST_MATH_GPU_ENABLED boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T) #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()(std::declval())) #endif @@ -209,7 +210,7 @@ boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::uintmax_ } template -inline boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +BOOST_MATH_GPU_ENABLED inline boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()(std::declval())) #endif @@ -219,7 +220,7 @@ inline boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::u } template -inline boost::math::pair bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +BOOST_MATH_GPU_ENABLED inline boost::math::pair bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy >::value && 
BOOST_MATH_IS_FLOAT(T) #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()(std::declval())) #endif @@ -231,7 +232,7 @@ inline boost::math::pair bisect(F f, T min, T max, Tol tol) noexcept(polic template -T newton_raphson_iterate(F f, T guess, T min, T max, int digits, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +BOOST_MATH_GPU_ENABLED T newton_raphson_iterate(F f, T guess, T min, T max, int digits, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()(std::declval())) #endif @@ -353,7 +354,7 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, boost::math::ui } template -inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +BOOST_MATH_GPU_ENABLED inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()(std::declval())) #endif From 07a5032ee8aec9a4ef8e3e157a5d210182be63fa Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 11:53:30 -0400 Subject: [PATCH 38/61] Add missing header --- include/boost/math/special_functions/ulp.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/boost/math/special_functions/ulp.hpp b/include/boost/math/special_functions/ulp.hpp index 3c0616db0..5d1617ace 100644 --- a/include/boost/math/special_functions/ulp.hpp +++ b/include/boost/math/special_functions/ulp.hpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace boost{ namespace math{ namespace detail{ From 7a1543dc64177c5eeb0a3aeaad6f74aa5d75adc2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 13:17:48 -0400 Subject: [PATCH 39/61] Add missing header --- include/boost/math/special_functions/detail/bessel_jy_asym.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp index 4bb11c2ff..117251b68 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace boost{ namespace math{ namespace detail{ From 65f68411685baf8f39f4f4890af8c3fbc1aba7d5 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 13:17:59 -0400 Subject: [PATCH 40/61] Markup iconv --- include/boost/math/special_functions/detail/iconv.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/boost/math/special_functions/detail/iconv.hpp b/include/boost/math/special_functions/detail/iconv.hpp index 79ac6e905..20889d411 100644 --- a/include/boost/math/special_functions/detail/iconv.hpp +++ b/include/boost/math/special_functions/detail/iconv.hpp @@ -10,26 +10,27 @@ #pragma once #endif +#include #include #include namespace boost { namespace math { namespace detail{ template -inline int iconv_imp(T v, Policy const&, boost::math::true_type const&) +BOOST_MATH_GPU_ENABLED inline int iconv_imp(T v, Policy const&, boost::math::true_type const&) { return static_cast(v); } template -inline int iconv_imp(T v, Policy const& pol, boost::math::false_type const&) +BOOST_MATH_GPU_ENABLED inline int iconv_imp(T v, Policy const& pol, 
boost::math::false_type const&)
 {
    BOOST_MATH_STD_USING
    return iround(v, pol);
 }
 
 template
-inline int iconv(T v, Policy const& pol)
+BOOST_MATH_GPU_ENABLED inline int iconv(T v, Policy const& pol)
 {
    typedef typename boost::math::is_convertible::type tag_type;
    return iconv_imp(v, pol, tag_type());

From a90078d61e002d3803874739ed0adcdecc58248e Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 13:18:13 -0400
Subject: [PATCH 41/61] Add iround for NVRTC

---
 .../boost/math/special_functions/round.hpp | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/include/boost/math/special_functions/round.hpp b/include/boost/math/special_functions/round.hpp
index 7a76cd32f..bb99da7e3 100644
--- a/include/boost/math/special_functions/round.hpp
+++ b/include/boost/math/special_functions/round.hpp
@@ -273,6 +273,30 @@ BOOST_MATH_GPU_ENABLED float round(float x, const Policy&)
    return ::roundf(x);
 }
 
+template
+BOOST_MATH_GPU_ENABLED int iround(T x)
+{
+   return static_cast(::lround(x));
+}
+
+template <>
+BOOST_MATH_GPU_ENABLED int iround(float x)
+{
+   return static_cast(::lroundf(x));
+}
+
+template
+BOOST_MATH_GPU_ENABLED int iround(T x, const Policy&)
+{
+   return static_cast(::lround(x));
+}
+
+template
+BOOST_MATH_GPU_ENABLED int iround(float x, const Policy&)
+{
+   return static_cast(::lroundf(x));
+}
+
 template
 BOOST_MATH_GPU_ENABLED long lround(T x)
 {

From 7f8572e3562be2f0126102594558a6fbee977ec2 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 13:18:31 -0400
Subject: [PATCH 42/61] Add tgamma1pm1 with policy overload for NVRTC

---
 include/boost/math/special_functions/gamma.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index be3bc504f..f16f8a3b6 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -2323,6 +2323,12 @@ BOOST_MATH_GPU_ENABLED T tgamma1pm1(T z)
    return expm1(lgamma(1 + z));
 }
 
+template
+BOOST_MATH_GPU_ENABLED T tgamma1pm1(T x, const Policy&)
+{
+   return tgamma1pm1(x);
+}
+
 } // namespace math
 } // namespace boost

From cdf5dbb204ae1e6f851dabec7801a57243368928 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 13:18:39 -0400
Subject: [PATCH 43/61] Disable header

---
 include/boost/math/special_functions/next.hpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/boost/math/special_functions/next.hpp b/include/boost/math/special_functions/next.hpp
index 02a208e4e..fd08162f9 100644
--- a/include/boost/math/special_functions/next.hpp
+++ b/include/boost/math/special_functions/next.hpp
@@ -10,6 +10,11 @@
 #pragma once
 #endif
 
+#include
+
+// TODO(mborland): Need to remove recursion from these algos
+#ifndef BOOST_MATH_HAS_NVRTC
+
 #include
 #include
 #include
@@ -920,4 +925,6 @@ inline typename tools::promote_args::type float_advance(const T& val, int dis
 
 }} // boost math namespaces
 
+#endif
+
 #endif // BOOST_MATH_SPECIAL_NEXT_HPP

From 857116757286c9698ecf79743f983f3e54a3dd98 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 13:19:23 -0400
Subject: [PATCH 44/61] Fix factorial support for CUDA platforms

---
 .../detail/unchecked_factorial.hpp | 96 +++++++++++--------
 .../math/special_functions/factorials.hpp | 31 +++---
 include/boost/math/tools/config.hpp | 1 +
 3 files changed, 74 insertions(+), 54 deletions(-)

diff --git a/include/boost/math/special_functions/detail/unchecked_factorial.hpp
b/include/boost/math/special_functions/detail/unchecked_factorial.hpp index f7720a2ab..92481f2c6 100644 --- a/include/boost/math/special_functions/detail/unchecked_factorial.hpp +++ b/include/boost/math/special_functions/detail/unchecked_factorial.hpp @@ -10,19 +10,23 @@ #pragma once #endif -#ifdef _MSC_VER -#pragma warning(push) // Temporary until lexical cast fixed. -#pragma warning(disable: 4127 4701) -#endif -#include -#ifdef _MSC_VER -#pragma warning(pop) -#endif -#include +#include +#include +#include +#include #include -#include -#include -#include + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +# ifdef _MSC_VER +# pragma warning(push) // Temporary until lexical cast fixed. +# pragma warning(disable: 4127 4701) +# endif +# include +# ifdef _MSC_VER +# pragma warning(pop) +# endif +#endif + #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -46,13 +50,21 @@ struct max_factorial; template struct unchecked_factorial_data; +#ifdef BOOST_MATH_HAS_NVRTC + +// Need fwd decl +template +BOOST_MATH_GPU_ENABLED inline T unchecked_factorial(unsigned i); + +#endif + #ifndef BOOST_MATH_HAS_GPU_SUPPORT template struct unchecked_factorial_data { #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - static constexpr std::array factorials = { { + static constexpr boost::math::array factorials = { { 1.0F, 1.0F, 2.0F, @@ -90,15 +102,15 @@ struct unchecked_factorial_data 0.29523279903960414084761860964352e39F, }}; #else - static const std::array factorials; + static const boost::math::array factorials; #endif }; template #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - constexpr std::array unchecked_factorial_data::factorials; + constexpr boost::math::array unchecked_factorial_data::factorials; #else - const std::array unchecked_factorial_data::factorials = {{ + const boost::math::array unchecked_factorial_data::factorials = {{ 1.0F, 1.0F, 2.0F, @@ -204,7 +216,7 @@ template struct unchecked_factorial_data { #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - static constexpr std::array factorials = { { + static constexpr boost::math::array factorials = { { 1.0, 1.0, 2.0, @@ -378,15 +390,15 @@ struct unchecked_factorial_data 0.7257415615307998967396728211129263114717e307, }}; #else - static const std::array factorials; + static const boost::math::array factorials; #endif }; template #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - constexpr std::array unchecked_factorial_data::factorials; + constexpr boost::math::array unchecked_factorial_data::factorials; #else - const std::array unchecked_factorial_data::factorials = {{ + const boost::math::array unchecked_factorial_data::factorials = {{ 1.0, 1.0, 2.0, @@ -633,7 +645,7 @@ template struct unchecked_factorial_data { #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - static constexpr std::array factorials = { { + static constexpr boost::math::array factorials = { { 1L, 1L, 2L, @@ -807,15 +819,15 @@ struct unchecked_factorial_data 0.7257415615307998967396728211129263114717e307L, }}; #else - static const std::array factorials; + static const boost::math::array factorials; #endif }; template #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - constexpr std::array unchecked_factorial_data::factorials; + constexpr boost::math::array unchecked_factorial_data::factorials; #else - const std::array unchecked_factorial_data::factorials = {{ + const boost::math::array unchecked_factorial_data::factorials = {{ 1L, 1L, 2L, @@ -1008,7 +1020,7 @@ template struct unchecked_factorial_data { #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - static constexpr std::array factorials = { { + static constexpr boost::math::array 
factorials = { { 1, 1, 2, @@ -1182,15 +1194,15 @@ struct unchecked_factorial_data 0.7257415615307998967396728211129263114717e307Q, } }; #else - static const std::array factorials; + static const boost::math::array factorials; #endif }; template #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES -constexpr std::array unchecked_factorial_data::factorials; +constexpr boost::math::array unchecked_factorial_data::factorials; #else -const std::array unchecked_factorial_data::factorials = { { +const boost::math::array unchecked_factorial_data::factorials = { { 1, 1, 2, @@ -1402,7 +1414,7 @@ const typename unchecked_factorial_initializer::init unchecked_factorial_init template -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant&) +inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant&) { // // If you're foolish enough to instantiate factorial @@ -1416,10 +1428,10 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant(factorial(n)); // See factorial documentation for more detail. // - static_assert(!std::is_integral::value && !std::numeric_limits::is_integer, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value && !boost::math::numeric_limits::is_integer, "Type T must not be an integral type"); // We rely on C++11 thread safe initialization here: - static const std::array factorials = {{ + static const boost::math::array factorials = {{ T(boost::math::tools::convert_from_string("1")), T(boost::math::tools::convert_from_string("1")), T(boost::math::tools::convert_from_string("2")), @@ -1527,7 +1539,7 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant&) +inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant&) { // // If you're foolish enough to instantiate factorial @@ -1541,7 +1553,7 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant(factorial(n)); // See factorial documentation for more detail. 
// - static_assert(!std::is_integral::value && !std::numeric_limits::is_integer, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value && !boost::math::numeric_limits::is_integer, "Type T must not be an integral type"); static const char* const factorial_strings[] = { "1", @@ -1667,13 +1679,13 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant::digits>&) +BOOST_MATH_GPU_ENABLED inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant::digits>&) { return unchecked_factorial(i); } template -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant::digits>&) +BOOST_MATH_GPU_ENABLED inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant::digits>&) { return unchecked_factorial(i); } @@ -1682,14 +1694,14 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant&) +inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant&) { return unchecked_factorial(i); } #endif #ifdef BOOST_MATH_USE_FLOAT128 template -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant&) +inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant&) { return unchecked_factorial(i); } @@ -1698,14 +1710,14 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant -inline T unchecked_factorial(unsigned i) +BOOST_MATH_GPU_ENABLED inline T unchecked_factorial(unsigned i) { typedef typename boost::math::policies::precision >::type tag_type; return unchecked_factorial_imp(i, tag_type()); } #ifdef BOOST_MATH_USE_FLOAT128 -#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL : std::numeric_limits::digits == 113 ? max_factorial::value +#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL : boost::math::numeric_limits::digits == 113 ? max_factorial::value #else #define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL #endif @@ -1714,10 +1726,10 @@ template struct max_factorial { static constexpr unsigned value = - std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value - : std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value + boost::math::numeric_limits::digits == boost::math::numeric_limits::digits ? max_factorial::value + : boost::math::numeric_limits::digits == boost::math::numeric_limits::digits ? max_factorial::value #ifndef BOOST_MATH_GPU_ENABLED - : std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value + : boost::math::numeric_limits::digits == boost::math::numeric_limits::digits ? max_factorial::value BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL #endif : 100; diff --git a/include/boost/math/special_functions/factorials.hpp b/include/boost/math/special_functions/factorials.hpp index 7229635cb..ec6978bdc 100644 --- a/include/boost/math/special_functions/factorials.hpp +++ b/include/boost/math/special_functions/factorials.hpp @@ -10,10 +10,14 @@ #pragma once #endif -#include +#include +#include +#include +#include #include #include -#include +#include + #ifdef _MSC_VER #pragma warning(push) // Temporary until lexical cast fixed. 
#pragma warning(disable: 4127 4701) @@ -21,16 +25,14 @@ #ifdef _MSC_VER #pragma warning(pop) #endif -#include -#include namespace boost { namespace math { template -inline T factorial(unsigned i, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T factorial(unsigned i, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); // factorial(n) is not implemented // because it would overflow integral type T for too small n // to be useful. Use instead a floating-point type, @@ -49,7 +51,7 @@ inline T factorial(unsigned i, const Policy& pol) } template -inline T factorial(unsigned i) +BOOST_MATH_GPU_ENABLED inline T factorial(unsigned i) { return factorial(i, policies::policy<>()); } @@ -72,9 +74,9 @@ inline double factorial(unsigned i) } */ template -T double_factorial(unsigned i, const Policy& pol) +BOOST_MATH_GPU_ENABLED T double_factorial(unsigned i, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); BOOST_MATH_STD_USING // ADL lookup of std names if(i & 1) { @@ -107,17 +109,20 @@ T double_factorial(unsigned i, const Policy& pol) } template -inline T double_factorial(unsigned i) +BOOST_MATH_GPU_ENABLED inline T double_factorial(unsigned i) { return double_factorial(i, policies::policy<>()); } +// TODO(mborland): We do not currently have support for tgamma_delta_ratio +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + namespace detail{ template T rising_factorial_imp(T x, int n, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); if(x < 0) { // @@ -165,7 +170,7 @@ T rising_factorial_imp(T x, int n, const Policy& pol) template inline T falling_factorial_imp(T x, unsigned n, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); BOOST_MATH_STD_USING // ADL of std names if(x == 0) return 0; @@ -262,6 +267,8 @@ inline typename tools::promote_args::type static_cast(x), n, pol); } +#endif // BOOST_MATH_HAS_GPU_SUPPORT + } // namespace math } // namespace boost diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index fda4029d6..1f444c004 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -795,6 +795,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return #define BOOST_MATH_NOEXCEPT(T) noexcept(boost::math::is_floating_point_v) #define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T) #define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T) +#define BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(T) #define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast(V) #define BOOST_MATH_FORCEINLINE __forceinline__ #define BOOST_MATH_STD_USING From 861c3f4f9e8010194bd47d89faec9322520a8185 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 13:19:55 -0400 Subject: [PATCH 45/61] Add definition of bessel traits --- .../boost/math/special_functions/bessel.hpp | 49 +++++++++++++++++-- .../boost/math/special_functions/math_fwd.hpp | 30 +++++++----- 2 files changed, 62 insertions(+), 17 deletions(-) diff --git a/include/boost/math/special_functions/bessel.hpp 
b/include/boost/math/special_functions/bessel.hpp index 3bba825da..cfc60af79 100644 --- a/include/boost/math/special_functions/bessel.hpp +++ b/include/boost/math/special_functions/bessel.hpp @@ -38,10 +38,7 @@ #include #include #include - -#ifndef BOOST_MATH_HAS_NVRTC #include -#endif #ifdef _MSC_VER # pragma warning(push) @@ -50,6 +47,50 @@ namespace boost{ namespace math{ +// Since we cannot pull this in from math fwd we need a copy +#ifdef BOOST_MATH_HAS_NVRTC + +namespace detail{ + + typedef boost::math::integral_constant bessel_no_int_tag; // No integer optimisation possible. + typedef boost::math::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation. + typedef boost::math::integral_constant bessel_int_tag; // Definite integer optimisation. + + template + struct bessel_traits + { + using result_type = typename boost::math::conditional< + boost::math::is_integral::value, + typename tools::promote_args::type, + tools::promote_args_t + >::type; + + typedef typename policies::precision::type precision_type; + + using optimisation_tag = typename boost::math::conditional< + (precision_type::value <= 0 || precision_type::value > 64), + bessel_no_int_tag, + typename boost::math::conditional< + boost::math::is_integral::value, + bessel_int_tag, + bessel_maybe_int_tag + >::type + >::type; + + using optimisation_tag128 = typename boost::math::conditional< + (precision_type::value <= 0 || precision_type::value > 113), + bessel_no_int_tag, + typename boost::math::conditional< + boost::math::is_integral::value, + bessel_int_tag, + bessel_maybe_int_tag + >::type + >::type; + }; + } // detail + +#endif + namespace detail{ template @@ -71,7 +112,7 @@ struct sph_bessel_j_small_z_series_term term = pow(mult, T(v)) / boost::math::tgamma(v+1+T(0.5f), Policy()); mult *= -mult; } - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { T r = term; ++N; diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 897cf8280..24d24ca74 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -24,12 +24,16 @@ #pragma once #endif +#include + +#ifndef BOOST_MATH_HAS_NVRTC + #include #include #include -#include #include #include // for argument promotion. +#include #include #define BOOST_NO_MACRO_EXPAND /**/ @@ -639,36 +643,36 @@ namespace boost namespace detail{ - typedef std::integral_constant bessel_no_int_tag; // No integer optimisation possible. - typedef std::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation. - typedef std::integral_constant bessel_int_tag; // Definite integer optimisation. + typedef boost::math::integral_constant bessel_no_int_tag; // No integer optimisation possible. + typedef boost::math::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation. + typedef boost::math::integral_constant bessel_int_tag; // Definite integer optimisation. 
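// A minimal host-side sketch (not part of the patch) of what these tags select,
// assuming the default policy yields a 53-bit precision_type for double:
//
//   using traits = bessel_traits<double, double, policies::policy<> >;
//   static_assert(std::is_same<traits::optimisation_tag, bessel_maybe_int_tag>::value,
//                 "53 <= 64 and double is not integral, so the maybe-int tag is chosen");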
template struct bessel_traits { - using result_type = typename std::conditional< - std::is_integral::value, + using result_type = typename boost::math::conditional< + boost::math::is_integral::value, typename tools::promote_args::type, tools::promote_args_t >::type; typedef typename policies::precision::type precision_type; - using optimisation_tag = typename std::conditional< + using optimisation_tag = typename boost::math::conditional< (precision_type::value <= 0 || precision_type::value > 64), bessel_no_int_tag, - typename std::conditional< - std::is_integral::value, + typename boost::math::conditional< + boost::math::is_integral::value, bessel_int_tag, bessel_maybe_int_tag >::type >::type; - using optimisation_tag128 = typename std::conditional< + using optimisation_tag128 = typename boost::math::conditional< (precision_type::value <= 0 || precision_type::value > 113), bessel_no_int_tag, - typename std::conditional< - std::is_integral::value, + typename boost::math::conditional< + boost::math::is_integral::value, bessel_int_tag, bessel_maybe_int_tag >::type @@ -1817,6 +1821,6 @@ template \ - +#endif // BOOST_MATH_HAS_NVRTC #endif // BOOST_MATH_SPECIAL_MATH_FWD_HPP From d24cb950124872160f9427e2163df493d9d9ac3f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 13:20:21 -0400 Subject: [PATCH 46/61] Add cyl_bessel_i NVRTC testing --- test/nvrtc_jamfile | 4 +- test/test_cyl_bessel_i_nvrtc_double.cpp | 190 ++++++++++++++++++++++++ test/test_cyl_bessel_i_nvrtc_float.cpp | 190 ++++++++++++++++++++++++ 3 files changed, 382 insertions(+), 2 deletions(-) create mode 100644 test/test_cyl_bessel_i_nvrtc_double.cpp create mode 100644 test/test_cyl_bessel_i_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 1da64aedc..403042ead 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -109,8 +109,8 @@ run test_bessel_y0_nvrtc_double.cpp ; run test_bessel_y0_nvrtc_float.cpp ; run test_bessel_y1_nvrtc_double.cpp ; run test_bessel_y1_nvrtc_float.cpp ; -# run test_cyl_bessel_i_nvrtc_double.cpp ; -# run test_cyl_bessel_i_nvrtc_float.cpp ; +run test_cyl_bessel_i_nvrtc_double.cpp ; +run test_cyl_bessel_i_nvrtc_float.cpp ; run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_cyl_bessel_i_nvrtc_double.cpp b/test/test_cyl_bessel_i_nvrtc_double.cpp new file mode 100644 index 000000000..50bfc0c79 --- /dev/null +++ b/test/test_cyl_bessel_i_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_i_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_i_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_i_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_i_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::cyl_bessel_i(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_cyl_bessel_i_nvrtc_float.cpp b/test/test_cyl_bessel_i_nvrtc_float.cpp new file mode 100644 index 000000000..c73992a27 --- /dev/null +++ b/test/test_cyl_bessel_i_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_i_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_i_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_i_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_i_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::cyl_bessel_i(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 3df8593f9a1cd9a9c411d58c20aad27f789bf483 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 13:49:23 -0400 Subject: [PATCH 47/61] Fix cyl_bessel_jy warnings --- .../special_functions/detail/bessel_jy.hpp | 26 ++++++++++--------- .../detail/bessel_jy_asym.hpp | 2 +- .../detail/bessel_jy_zero.hpp | 8 ++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/include/boost/math/special_functions/detail/bessel_jy.hpp b/include/boost/math/special_functions/detail/bessel_jy.hpp index d43f3050c..33225c647 100644 --- a/include/boost/math/special_functions/detail/bessel_jy.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy.hpp @@ -11,6 +11,8 @@ #endif #include +#include +#include #include #include #include @@ -283,7 +285,7 @@ namespace boost { namespace math { reflect = true; v = -v; // v is non-negative from here } - if (v > static_cast((std::numeric_limits::max)())) + if (v > static_cast((boost::math::numeric_limits::max)())) { *J = *Y = policies::raise_evaluation_error(function, "Order of Bessel function is too large to evaluate: got %1%", v, pol); return 1; // LCOV_EXCL_LINE previous line will throw. @@ -309,10 +311,10 @@ namespace boost { namespace math { else if(kind & need_j) *J = policies::raise_domain_error(function, "Value of Bessel J_v(x) is complex-infinity at %1%", x, pol); // complex infinity else - *J = std::numeric_limits::quiet_NaN(); // LCOV_EXCL_LINE, we should never get here, any value will do, not using J. + *J = boost::math::numeric_limits::quiet_NaN(); // LCOV_EXCL_LINE, we should never get here, any value will do, not using J. 
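// Descriptive note: `kind` is a bitmask of the need_j / need_y flags defined in
// this header; the wrappers in bessel.hpp request only what they use, e.g.
//   bessel_jy(v, x, &result_J, &y, need_j, pol);
// so the branches here fill *J and *Y only when the corresponding flag is set.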
 if((kind & need_y) == 0)
-            *Y = std::numeric_limits::quiet_NaN(); // any value will do, not using Y.
+            *Y = boost::math::numeric_limits::quiet_NaN(); // any value will do, not using Y.
 else
 {
 // We should never get here:
@@ -332,7 +334,7 @@ namespace boost { namespace math {
          // and divergent which leads to large errors :-(
          //
          Jv = bessel_j_small_z_series(v, x, pol);
-         Yv = std::numeric_limits::quiet_NaN();
+         Yv = boost::math::numeric_limits::quiet_NaN();
       }
       else if((x < 1) && (u != 0) && (log(policies::get_epsilon() / 2) > v * log((x/2) * (x/2) / v)))
       {
@@ -343,7 +345,7 @@ namespace boost { namespace math {
          if(kind&need_j)
             Jv = bessel_j_small_z_series(v, x, pol);
         else
-            Jv = std::numeric_limits::quiet_NaN();
+            Jv = boost::math::numeric_limits::quiet_NaN();
         if((org_kind&need_y && (!reflect || (cp != 0)))
            || (org_kind & need_j && (reflect && (sp != 0))))
         {
@@ -351,7 +353,7 @@ namespace boost { namespace math {
            Yv = bessel_y_small_z_series(v, x, &Yv_scale, pol);
         }
         else
-            Yv = std::numeric_limits::quiet_NaN();
+            Yv = boost::math::numeric_limits::quiet_NaN();
      }
      else if((u == 0) && (x < policies::get_epsilon()))
      {
@@ -362,7 +364,7 @@ namespace boost { namespace math {
         if(kind&need_j)
            Jv = bessel_j_small_z_series(v, x, pol);
        else
-           Jv = std::numeric_limits::quiet_NaN();
+           Jv = boost::math::numeric_limits::quiet_NaN();
        if((org_kind&need_y && (!reflect || (cp != 0)))
           || (org_kind & need_j && (reflect && (sp != 0))))
        {
@@ -370,7 +372,7 @@ namespace boost { namespace math {
           Yv = bessel_yn_small_z(n, x, &Yv_scale, pol);
        }
        else
-           Yv = std::numeric_limits::quiet_NaN();
+           Yv = boost::math::numeric_limits::quiet_NaN();
        // LCOV_EXCL_STOP
     }
     else if(asymptotic_bessel_large_x_limit(v, x))
     {
@@ -380,13 +382,13 @@
           Yv = asymptotic_bessel_y_large_x_2(v, x, pol);
        }
        else
-           Yv = std::numeric_limits::quiet_NaN(); // any value will do, we're not using it.
+           Yv = boost::math::numeric_limits::quiet_NaN(); // any value will do, we're not using it.
        if(kind&need_j)
        {
           Jv = asymptotic_bessel_j_large_x_2(v, x, pol);
        }
        else
-           Jv = std::numeric_limits::quiet_NaN(); // any value will do, we're not using it.
+           Jv = boost::math::numeric_limits::quiet_NaN(); // any value will do, we're not using it.
     }
     else if((x > 8) && hankel_PQ(v, x, &p, &q, pol))
     {
@@ -448,7 +450,7 @@ namespace boost { namespace math {
           Jv = scale * W / (Yv * fv - Yv1); // Wronskian relation
        }
        else
-           Jv = std::numeric_limits::quiet_NaN(); // any value will do, we're not using it.
+           Jv = boost::math::numeric_limits::quiet_NaN(); // any value will do, we're not using it.
        Yv_scale = scale;
     }
     else // x in (2, \infty)
@@ -563,7 +565,7 @@ namespace boost { namespace math {
           Yv = prev;
        }
        else
-           Yv = std::numeric_limits::quiet_NaN(); // any value will do, we're not using it.
+           Yv = boost::math::numeric_limits::quiet_NaN(); // any value will do, we're not using it.
     }
 
     if (reflect)
diff --git a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp
index 117251b68..51e4efafc 100644
--- a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp
+++ b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp
@@ -158,7 +158,7 @@ BOOST_MATH_GPU_ENABLED inline bool asymptotic_bessel_large_x_limit(const T& v, c
    // error rates either side of the divide for v < 10000.
    // At double precision eps^1/8 ~= 0.01.
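   // Concretely: double has eps = 2^-52 ~= 2.2e-16, so eps^(1/8) ~= 0.011; for
   // orders v <= 1 the asymptotic expansion is therefore only selected once x
   // exceeds roughly 1 / 0.011 ~= 90.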
// - return (std::max)(T(fabs(v)), T(1)) < x * sqrt(tools::forth_root_epsilon()); + return BOOST_MATH_GPU_SAFE_MAX(T(fabs(v)), T(1)) < x * sqrt(tools::forth_root_epsilon()); } template diff --git a/include/boost/math/special_functions/detail/bessel_jy_zero.hpp b/include/boost/math/special_functions/detail/bessel_jy_zero.hpp index 2e3b8c6d8..15671c0df 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_zero.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy_zero.hpp @@ -31,6 +31,10 @@ #include #endif + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_suppress 20012 + #endif + namespace boost { namespace math { namespace detail { @@ -632,4 +636,8 @@ } // namespace bessel_zero } } } // namespace boost::math::detail + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_default 20012 + #endif + #endif // BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_ From a9e14285dfd36b3d9ed90e0920bbbc77b263e4cf Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 13:49:41 -0400 Subject: [PATCH 48/61] Fix CUDA forward declarations --- .../boost/math/special_functions/math_fwd.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 24d24ca74..21f51e507 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -424,15 +424,15 @@ namespace boost template struct max_factorial; template - RT factorial(unsigned int); + BOOST_MATH_GPU_ENABLED RT factorial(unsigned int); template - RT factorial(unsigned int, const Policy& pol); + BOOST_MATH_GPU_ENABLED RT factorial(unsigned int, const Policy& pol); template BOOST_MATH_GPU_ENABLED RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT)); template - RT double_factorial(unsigned i); + BOOST_MATH_GPU_ENABLED RT double_factorial(unsigned i); template - RT double_factorial(unsigned i, const Policy& pol); + BOOST_MATH_GPU_ENABLED RT double_factorial(unsigned i, const Policy& pol); template tools::promote_args_t falling_factorial(RT x, unsigned n); @@ -558,11 +558,11 @@ namespace boost // Hypotenuse function sqrt(x ^ 2 + y ^ 2). template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hypot(T1 x, T2 y); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hypot(T1 x, T2 y, const Policy&); // cbrt - cube root. 
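A device-side sketch of what the GPU-enabled hypot declarations above permit (the kernel below is illustrative only, patterned on the test kernels elsewhere in this series, and is not part of the patch):

__global__ void hypot_sketch(const double *in1, const double *in2, double *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        // Callable from device code once the declaration carries BOOST_MATH_GPU_ENABLED
        out[i] = boost::math::hypot(in1[i], in2[i]);
    }
}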
@@ -1404,10 +1404,10 @@ namespace boost \ using boost::math::max_factorial;\ template \ - inline RT factorial(unsigned int i) { return boost::math::factorial(i, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline RT factorial(unsigned int i) { return boost::math::factorial(i, Policy()); }\ using boost::math::unchecked_factorial;\ template \ - inline RT double_factorial(unsigned i){ return boost::math::double_factorial(i, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline RT double_factorial(unsigned i){ return boost::math::double_factorial(i, Policy()); }\ template \ inline boost::math::tools::promote_args_t falling_factorial(RT x, unsigned n){ return boost::math::falling_factorial(x, n, Policy()); }\ template \ @@ -1469,7 +1469,7 @@ namespace boost \ template \ inline boost::math::tools::promote_args_t \ - hypot(T1 x, T2 y){ return boost::math::hypot(x, y, Policy()); }\ + BOOST_MATH_GPU_ENABLED hypot(T1 x, T2 y){ return boost::math::hypot(x, y, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t cbrt(RT z){ return boost::math::cbrt(z, Policy()); }\ From 85ec2f88b4fa3ccaa941e9e50f07ac143dca3cb9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 13:49:53 -0400 Subject: [PATCH 49/61] Fix maybe-unused variable warning --- include/boost/math/special_functions/bessel.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/boost/math/special_functions/bessel.hpp b/include/boost/math/special_functions/bessel.hpp index cfc60af79..2e0d6afd0 100644 --- a/include/boost/math/special_functions/bessel.hpp +++ b/include/boost/math/special_functions/bessel.hpp @@ -155,7 +155,7 @@ template BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol) { BOOST_MATH_STD_USING - constexpr auto function = "boost::math::bessel_j<%1%>(%1%,%1%)"; + if(x < 0) { // better have integer v: @@ -176,6 +176,7 @@ BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, } else { + constexpr auto function = "boost::math::bessel_j<%1%>(%1%,%1%)"; return policies::raise_domain_error(function, "Got x = %1%, but we need x >= 0", x, pol); } } From 2378f1056b0d16c59e91810d262c74d85ee86ff8 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 13:50:12 -0400 Subject: [PATCH 50/61] Add CUDA cyl_bessel_j testing --- test/cuda_jamfile | 2 + test/test_cyl_bessel_j_double.cu | 104 +++++++++++++++++++++++++++++++ test/test_cyl_bessel_j_float.cu | 104 +++++++++++++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 test/test_cyl_bessel_j_double.cu create mode 100644 test/test_cyl_bessel_j_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index aca93cb31..96c916119 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -115,6 +115,8 @@ run test_bessel_y1_double.cu ; run test_bessel_y1_float.cu ; run test_cyl_bessel_i_double.cu ; run test_cyl_bessel_i_float.cu ; +run test_cyl_bessel_j_double.cu ; +run test_cyl_bessel_j_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/test_cyl_bessel_j_double.cu b/test/test_cyl_bessel_j_double.cu new file mode 100644 index 000000000..b5d93f1dd --- /dev/null +++ b/test/test_cyl_bessel_j_double.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_bessel_j(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_bessel_j_float.cu b/test/test_cyl_bessel_j_float.cu
new file mode 100644
index 000000000..3edc2a7c9
--- /dev/null
+++ b/test/test_cyl_bessel_j_float.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_bessel_j(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 768af36de4589e5c6a23162acadb5da0fe1ec5bf Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 14:57:38 -0400 Subject: [PATCH 51/61] Add sign overload for lgamma --- .../boost/math/special_functions/gamma.hpp | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index f16f8a3b6..9268ba415 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -2296,7 +2296,7 @@ inline BOOST_MATH_GPU_ENABLED float tgamma(float x) { return ::tgammaf(x); } inline BOOST_MATH_GPU_ENABLED double tgamma(double x) { return ::tgamma(x); } template -inline BOOST_MATH_GPU_ENABLED T tgamma(T x, const Policy&) +BOOST_MATH_GPU_ENABLED T tgamma(T x, const Policy&) { return boost::math::tgamma(x); } @@ -2305,11 +2305,30 @@ inline BOOST_MATH_GPU_ENABLED float lgamma(float x) { return ::lgammaf(x); } inline BOOST_MATH_GPU_ENABLED double lgamma(double x) { return ::lgamma(x); } template -inline BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&) +BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&) { return boost::math::lgamma(x); } +template +BOOST_MATH_GPU_ENABLED T lgamma(T x, int* sign, const Policy&) +{ + auto res = boost::math::lgamma(x); + if (sign != nullptr) + { + if (res < 0) + { + *sign = -1; + } + else + { + *sign = 1; + } + } + + return res; +} + template BOOST_MATH_GPU_ENABLED T tgamma1pm1(T z) { From 6eb7e0e63e3d80932bda729e24c4d0ef68d96585 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 14:57:49 -0400 Subject: [PATCH 52/61] Fix warnings --- include/boost/math/special_functions/bessel.hpp | 1 - include/boost/math/special_functions/detail/bessel_jy.hpp | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/math/special_functions/bessel.hpp b/include/boost/math/special_functions/bessel.hpp index 2e0d6afd0..081473442 100644 --- a/include/boost/math/special_functions/bessel.hpp +++ b/include/boost/math/special_functions/bessel.hpp @@ -143,7 +143,6 @@ template BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp_final(T v, T x, const bessel_no_int_tag& t, const Policy& pol) { BOOST_MATH_STD_USING - constexpr auto function = "boost::math::bessel_j<%1%>(%1%,%1%)"; T result_J, y; // LCOV_EXCL_LINE bessel_jy(v, x, &result_J, &y, need_j, pol); diff --git a/include/boost/math/special_functions/detail/bessel_jy.hpp b/include/boost/math/special_functions/detail/bessel_jy.hpp index 33225c647..143dce872 100644 --- a/include/boost/math/special_functions/detail/bessel_jy.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include From 29282360aab50db268d2fb32ae38437e4b4c0f79 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 14:58:09 -0400 Subject: [PATCH 53/61] Add NVRTC cyl_bessel_j testing --- test/nvrtc_jamfile | 2 + test/test_cyl_bessel_j_nvrtc_double.cpp | 190 ++++++++++++++++++++++++ test/test_cyl_bessel_j_nvrtc_float.cpp | 190 ++++++++++++++++++++++++ 3 files changed, 382 insertions(+) create mode 100644 test/test_cyl_bessel_j_nvrtc_double.cpp create mode 100644 test/test_cyl_bessel_j_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 403042ead..b4b1d2db6 100644 --- a/test/nvrtc_jamfile 
+++ b/test/nvrtc_jamfile
@@ -111,6 +111,8 @@ run test_bessel_y1_nvrtc_double.cpp ;
 run test_bessel_y1_nvrtc_float.cpp ;
 run test_cyl_bessel_i_nvrtc_double.cpp ;
 run test_cyl_bessel_i_nvrtc_float.cpp ;
+run test_cyl_bessel_j_nvrtc_double.cpp ;
+run test_cyl_bessel_j_nvrtc_float.cpp ;
 
 run test_cbrt_nvrtc_double.cpp ;
 run test_cbrt_nvrtc_float.cpp ;
diff --git a/test/test_cyl_bessel_j_nvrtc_double.cpp b/test/test_cyl_bessel_j_nvrtc_double.cpp
new file mode 100644
index 000000000..f74e112ed
--- /dev/null
+++ b/test/test_cyl_bessel_j_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include
+#include
+extern "C" __global__
+void test_cyl_bessel_j_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
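+        // nvrtcGetPTX copies the PTX produced by the compilation above into
+        // the buffer just allocated; cuModuleLoadDataEx below then JIT-compiles
+        // that PTX for the device owning the current context.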
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_bessel_j(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_bessel_j_nvrtc_float.cpp b/test/test_cyl_bessel_j_nvrtc_float.cpp
new file mode 100644
index 000000000..e3d792843
--- /dev/null
+++ b/test/test_cyl_bessel_j_nvrtc_float.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_j_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::cyl_bessel_j(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 7caad06a4edd5103a0eeed964e37b69d13dedfa3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 14:58:18 -0400 Subject: [PATCH 54/61] Add NVCC sph_bessel testing --- test/cuda_jamfile | 2 + test/test_sph_bessel_double.cu | 119 +++++++++++++++++++++++++++++++++ test/test_sph_bessel_float.cu | 119 +++++++++++++++++++++++++++++++++ 3 files changed, 240 insertions(+) create mode 100644 test/test_sph_bessel_double.cu create mode 100644 test/test_sph_bessel_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 96c916119..26c15389c 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -117,6 +117,8 @@ run test_cyl_bessel_i_double.cu ; run test_cyl_bessel_i_float.cu ; run test_cyl_bessel_j_double.cu ; run test_cyl_bessel_j_float.cu ; +run test_sph_bessel_double.cu ; +run test_sph_bessel_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/test_sph_bessel_double.cu b/test/test_sph_bessel_double.cu new file mode 100644 index 000000000..5229dd8b5 --- /dev/null +++ b/test/test_sph_bessel_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const unsigned *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_bessel(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<unsigned> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng {42};
+    std::uniform_int_distribution<unsigned> order(1, 100);
+    std::uniform_real_distribution<float_type> val(0, 100);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = order(rng);
+        input_vector2[i] = val(rng);
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::sph_bessel(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    bool failed = false;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 3000)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << output_vector[i]
+                          << "\n  Serial: " << results[i]
+                          << "\n    Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_bessel_float.cu b/test/test_sph_bessel_float.cu
new file mode 100644
index 000000000..bd068a1a0
--- /dev/null
+++ b/test/test_sph_bessel_float.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const unsigned *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::sph_bessel(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + std::mt19937_64 rng {42}; + std::uniform_int_distribution order(1, 100); + std::uniform_real_distribution val(0, 100); + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = order(rng); + input_vector2[i] = val(rng); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::sph_bessel(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + bool failed = false; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(output_vector[i]) && std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 150) + { + std::cout << "error at line: " << i + << "\nParallel: " << results[i] + << "\n Serial: " << output_vector[i] + << "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 288c84fb7486c01ad1c7e66bc5b4ca3551d0ab5b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 15:44:09 -0400 Subject: [PATCH 55/61] Add NVRTC testing of sph_bessel --- test/nvrtc_jamfile | 2 + test/test_sph_bessel_nvrtc_double.cpp | 199 ++++++++++++++++++++++++++ test/test_sph_bessel_nvrtc_float.cpp | 199 ++++++++++++++++++++++++++ 3 files changed, 400 insertions(+) create mode 100644 test/test_sph_bessel_nvrtc_double.cpp create mode 100644 test/test_sph_bessel_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index b4b1d2db6..8ff83ba36 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -113,6 +113,8 @@ run test_cyl_bessel_i_nvrtc_double.cpp ; run test_cyl_bessel_i_nvrtc_float.cpp ; run test_cyl_bessel_j_nvrtc_double.cpp ; run test_cyl_bessel_j_nvrtc_float.cpp ; +run test_sph_bessel_nvrtc_double.cpp ; +run test_sph_bessel_nvrtc_float.cpp ; run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_sph_bessel_nvrtc_double.cpp b/test/test_sph_bessel_nvrtc_double.cpp new file mode 100644 index 000000000..e88726ed7 --- /dev/null +++ b/test/test_sph_bessel_nvrtc_double.cpp @@ -0,0 +1,199 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_j_kernel(const unsigned *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::sph_bessel(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + unsigned *h_in1, *d_in1; + float_type *h_in2, *h_out; + float_type *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new unsigned[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_int_distribution order(1, 100); + std::uniform_real_distribution val(0.0f, 100.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(order(rng)); + h_in2[i] = static_cast(val(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(unsigned)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(unsigned), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + bool failed = false; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::sph_bessel(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 3000) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + failed = true; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + if (failed) + { + return 1; + } + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_sph_bessel_nvrtc_float.cpp b/test/test_sph_bessel_nvrtc_float.cpp new file mode 100644 index 000000000..c9538cd5b --- /dev/null +++ b/test/test_sph_bessel_nvrtc_float.cpp @@ -0,0 +1,199 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_j_kernel(const unsigned *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::sph_bessel(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + unsigned *h_in1, *d_in1; + float_type *h_in2, *h_out; + float_type *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new unsigned[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_int_distribution order(1, 100); + std::uniform_real_distribution val(0.0f, 100.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(order(rng)); + h_in2[i] = static_cast(val(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(unsigned)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(unsigned), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + bool failed = false; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::sph_bessel(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 3000) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + failed = true; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + if (failed) + { + return 1; + } + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 288275e31f14bc3b5bba779b37a6fb0f537334ca Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 15:53:30 -0400 Subject: [PATCH 56/61] Add NVRTC testing of cyl_bessel_k --- test/nvrtc_jamfile | 2 + test/test_cyl_bessel_k_nvrtc_double.cpp | 190 ++++++++++++++++++++++++ test/test_cyl_bessel_k_nvrtc_float.cpp | 190 ++++++++++++++++++++++++ 3 files changed, 382 insertions(+) create mode 100644 test/test_cyl_bessel_k_nvrtc_double.cpp create mode 100644 test/test_cyl_bessel_k_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 8ff83ba36..9854bd746 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -113,6 +113,8 @@ run test_cyl_bessel_i_nvrtc_double.cpp ; run test_cyl_bessel_i_nvrtc_float.cpp ; run test_cyl_bessel_j_nvrtc_double.cpp ; run test_cyl_bessel_j_nvrtc_float.cpp ; +run test_cyl_bessel_k_nvrtc_double.cpp ; +run test_cyl_bessel_k_nvrtc_float.cpp ; run test_sph_bessel_nvrtc_double.cpp ; run test_sph_bessel_nvrtc_float.cpp ; diff --git a/test/test_cyl_bessel_k_nvrtc_double.cpp b/test/test_cyl_bessel_k_nvrtc_double.cpp new file mode 100644 index 000000000..66a8b1490 --- /dev/null +++ b/test/test_cyl_bessel_k_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_k_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_k_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_k_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_k_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::cyl_bessel_k(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_cyl_bessel_k_nvrtc_float.cpp b/test/test_cyl_bessel_k_nvrtc_float.cpp new file mode 100644 index 000000000..e23ff82c0 --- /dev/null +++ b/test/test_cyl_bessel_k_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_k_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_k_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_k_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_k_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::cyl_bessel_k(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From a96cd021250a2026985b2adccf4d6e59620637c2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 15:57:12 -0400 Subject: [PATCH 57/61] Add NVCC testing of cyl_bessel_k --- test/cuda_jamfile | 2 + test/test_cyl_bessel_k_double.cu | 104 +++++++++++++++++++++++++++++++ test/test_cyl_bessel_k_float.cu | 104 +++++++++++++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 test/test_cyl_bessel_k_double.cu create mode 100644 test/test_cyl_bessel_k_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 26c15389c..cde35e86d 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -117,6 +117,8 @@ run test_cyl_bessel_i_double.cu ; run test_cyl_bessel_i_float.cu ; run test_cyl_bessel_j_double.cu ; run test_cyl_bessel_j_float.cu ; +run test_cyl_bessel_k_double.cu ; +run test_cyl_bessel_k_float.cu ; run test_sph_bessel_double.cu ; run test_sph_bessel_float.cu ; diff --git a/test/test_cyl_bessel_k_double.cu b/test/test_cyl_bessel_k_double.cu new file mode 100644 index 000000000..3dfd2bf38 --- /dev/null +++ b/test/test_cyl_bessel_k_double.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::cyl_bessel_k(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cyl_bessel_k_float.cu b/test/test_cyl_bessel_k_float.cu new file mode 100644 index 000000000..b874857a0 --- /dev/null +++ b/test/test_cyl_bessel_k_float.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::cyl_bessel_k(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 854fc6fb80a61eecbbf437939b77bc5eb1347e32 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 20 Aug 2024 16:11:48 -0400 Subject: [PATCH 58/61] Add NVCC testing of cyl_neumann --- test/cuda_jamfile | 2 + test/test_cyl_neumann_double.cu | 116 ++++++++++++++++++++++++++++++++ test/test_cyl_neumann_float.cu | 104 ++++++++++++++++++++++++++++ 3 files changed, 222 insertions(+) create mode 100644 test/test_cyl_neumann_double.cu create mode 100644 test/test_cyl_neumann_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index cde35e86d..6dd78fdd8 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -121,6 +121,8 @@ run test_cyl_bessel_k_double.cu ; run test_cyl_bessel_k_float.cu ; run test_sph_bessel_double.cu ; run test_sph_bessel_float.cu ; +run test_cyl_neumann_double.cu ; +run test_cyl_neumann_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/test_cyl_neumann_double.cu b/test/test_cyl_neumann_double.cu new file mode 100644 index 000000000..0e7a72ff9 --- /dev/null +++ b/test/test_cyl_neumann_double.cu @@ -0,0 +1,116 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_neumann(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
diff --git a/test/test_cyl_neumann_float.cu b/test/test_cyl_neumann_float.cu
new file mode 100644
index 000000000..f621d2fc6
--- /dev/null
+++ b/test/test_cyl_neumann_float.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_neumann(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
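
The tolerances used above (10 epsilon for the float test, 5000 epsilon with a finite-value guard for the double test) are expressed in units of machine epsilon: boost::math::epsilon_difference returns the relative difference of its two arguments divided by the machine epsilon of the argument type. A self-contained illustration of the semantics (the numbers in the comment are approximate):

    #include <boost/math/special_functions/relative_difference.hpp>
    #include <iostream>

    int main()
    {
        double a = 1.0;
        double b = 1.0 + 1e-12;
        // The relative difference is ~1e-12, which is roughly 4500
        // double-precision epsilons (1e-12 / 2.22e-16).
        std::cout << boost::math::epsilon_difference(a, b) << std::endl;
    }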

From 48353cf10d3b4a9e3595cabd73a17b5c2b131102 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 16:20:10 -0400
Subject: [PATCH 59/61] Add NVRTC cyl_neumann testing

---
 test/nvrtc_jamfile                     |   2 +
 test/test_cyl_neumann_nvrtc_double.cpp | 190 +++++++++++++++++++++++++
 test/test_cyl_neumann_nvrtc_float.cpp  | 190 +++++++++++++++++++++++++
 3 files changed, 382 insertions(+)
 create mode 100644 test/test_cyl_neumann_nvrtc_double.cpp
 create mode 100644 test/test_cyl_neumann_nvrtc_float.cpp

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 9854bd746..835dd78ec 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -117,6 +117,8 @@ run test_cyl_bessel_k_nvrtc_double.cpp ;
 run test_cyl_bessel_k_nvrtc_float.cpp ;
 run test_sph_bessel_nvrtc_double.cpp ;
 run test_sph_bessel_nvrtc_float.cpp ;
+run test_cyl_neumann_nvrtc_double.cpp ;
+run test_cyl_neumann_nvrtc_float.cpp ;
 run test_cbrt_nvrtc_double.cpp ;
 run test_cbrt_nvrtc_float.cpp ;
 
diff --git a/test/test_cyl_neumann_nvrtc_double.cpp b/test/test_cyl_neumann_nvrtc_double.cpp
new file mode 100644
index 000000000..78bbd3b5c
--- /dev/null
+++ b/test/test_cyl_neumann_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cmath>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/bessel.hpp>
+extern "C" __global__
+void test_cyl_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_neumann_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_neumann_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_neumann_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_neumann(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at element: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_neumann_nvrtc_float.cpp b/test/test_cyl_neumann_nvrtc_float.cpp
new file mode 100644
index 000000000..78bbd3b5c
--- /dev/null
+++ b/test/test_cyl_neumann_nvrtc_float.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cmath>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/bessel.hpp>
+extern "C" __global__
+void test_cyl_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_neumann_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_neumann_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_neumann_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_neumann(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at element: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
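
Note that the --include-path entries baked into opts above are machine-specific: the BOOST_MATH_NVRTC_CI_RUN branch points at the GitHub Actions checkout, and the fallback at a developer's home directory. Anyone reproducing these tests locally would substitute their own checkout; for example (the path below is a placeholder, not a real location):

    // Adjust --include-path to wherever your boost-root checkout lives.
    const char* opts[] = {"--std=c++14",
                          "--include-path=/path/to/boost-root/libs/cuda-math/include/",
                          "-I/usr/local/cuda/include"};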

From 4d78c2c16bf0ec370b1e77f63cc6321de09dbee4 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 16:24:03 -0400
Subject: [PATCH 60/61] Add NVRTC sph_neumann testing

---
 test/nvrtc_jamfile                     |   2 +
 test/test_sph_neumann_nvrtc_double.cpp | 190 +++++++++++++++++++++++++
 test/test_sph_neumann_nvrtc_float.cpp  | 190 +++++++++++++++++++++++++
 3 files changed, 382 insertions(+)
 create mode 100644 test/test_sph_neumann_nvrtc_double.cpp
 create mode 100644 test/test_sph_neumann_nvrtc_float.cpp

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 835dd78ec..de235822e 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -119,6 +119,8 @@ run test_sph_bessel_nvrtc_double.cpp ;
 run test_sph_bessel_nvrtc_float.cpp ;
 run test_cyl_neumann_nvrtc_double.cpp ;
 run test_cyl_neumann_nvrtc_float.cpp ;
+run test_sph_neumann_nvrtc_double.cpp ;
+run test_sph_neumann_nvrtc_float.cpp ;
 run test_cbrt_nvrtc_double.cpp ;
 run test_cbrt_nvrtc_float.cpp ;
 
diff --git a/test/test_sph_neumann_nvrtc_double.cpp b/test/test_sph_neumann_nvrtc_double.cpp
new file mode 100644
index 000000000..61dcb07dd
--- /dev/null
+++ b/test/test_sph_neumann_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cmath>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/bessel.hpp>
+extern "C" __global__
+void test_sph_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_neumann(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_neumann_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_neumann_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_neumann_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_neumann(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at element: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_neumann_nvrtc_float.cpp b/test/test_sph_neumann_nvrtc_float.cpp
new file mode 100644
index 000000000..5d7ae59fe
--- /dev/null
+++ b/test/test_sph_neumann_nvrtc_float.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cmath>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/bessel.hpp>
+extern "C" __global__
+void test_sph_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_neumann(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_neumann_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_neumann_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_neumann_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_neumann(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at element: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
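
The NVRTC tests define BOOST_MATH_OVERFLOW_ERROR_POLICY as ignore_error before any Boost.Math header is pulled in, so an overflowing cyl_neumann or sph_neumann call returns an infinity rather than throwing; that is why the verification loops only compare results that are finite on the host side. In sketch form (the comparison line is abbreviated from the tests above):

    // Overflow yields an infinity instead of a thrown exception.
    #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
    #include <boost/math/special_functions/bessel.hpp>
    #include <cmath>

    // Only finite host-side reference values are meaningful to compare:
    // if (std::isfinite(res)) { /* compare epsilon_difference(res, h_out[i]) */ }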

From a9eb567fabbc3f5c4a4d46e2e482bd454aa046cf Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 16:27:40 -0400
Subject: [PATCH 61/61] Add NVCC sph_neumann testing

---
 test/cuda_jamfile               |   2 +
 test/test_sph_neumann_double.cu | 116 ++++++++++++++++++++++++++++++++
 test/test_sph_neumann_float.cu  | 116 ++++++++++++++++++++++++++++++++
 3 files changed, 234 insertions(+)
 create mode 100644 test/test_sph_neumann_double.cu
 create mode 100644 test/test_sph_neumann_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 6dd78fdd8..a061fe02a 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -123,6 +123,8 @@ run test_sph_bessel_double.cu ;
 run test_sph_bessel_float.cu ;
 run test_cyl_neumann_double.cu ;
 run test_cyl_neumann_float.cu ;
+run test_sph_neumann_double.cu ;
+run test_sph_neumann_float.cu ;
 run test_cbrt_double.cu ;
 run test_cbrt_float.cu ;
 
diff --git a/test/test_sph_neumann_double.cu b/test/test_sph_neumann_double.cu
new file mode 100644
index 000000000..f59dc7acc
--- /dev/null
+++ b/test/test_sph_neumann_double.cu
@@ -0,0 +1,116 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_neumann(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::sph_neumann(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000)
+            {
+                std::cout << "error at element: " << i
+                          << "\nParallel: " << output_vector[i]
+                          << "\n  Serial: " << results[i]
+                          << "\n    Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_neumann_float.cu b/test/test_sph_neumann_float.cu
new file mode 100644
index 000000000..a295e376f
--- /dev/null
+++ b/test/test_sph_neumann_float.cu
@@ -0,0 +1,116 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_neumann(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::sph_neumann(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000)
+            {
+                std::cout << "error at element: " << i
+                          << "\nParallel: " << output_vector[i]
+                          << "\n  Serial: " << results[i]
+                          << "\n    Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
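
A final implementation note common to every kernel launch in this series: blocksPerGrid (numBlocks in the NVRTC tests) is an integer ceiling division, so the grid always covers at least numElements threads, and the i < numElements guard inside each kernel absorbs the overhang of the last block. The idiom, isolated:

    // ceil(numElements / threadsPerBlock) in integer arithmetic
    int numElements = 50000;
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // 196 blocks for 50000 elements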