diff --git a/doc/sf/airy.qbk b/doc/sf/airy.qbk
index 5ff4c7cb5..4756bee2d 100644
--- a/doc/sf/airy.qbk
+++ b/doc/sf/airy.qbk
@@ -18,10 +18,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
    namespace boost { namespace math {

    template <class T>
-   ``__sf_result`` airy_ai(T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x);

    template <class T, class Policy>
-   ``__sf_result`` airy_ai(T x, const Policy&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x, const Policy&);

    }} // namespaces
@@ -78,10 +78,10 @@ This function is implemented in terms of the Bessel functions using the relation
    namespace boost { namespace math {

    template <class T>
-   ``__sf_result`` airy_bi(T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x);

    template <class T, class Policy>
-   ``__sf_result`` airy_bi(T x, const Policy&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x, const Policy&);

    }} // namespaces
@@ -132,10 +132,10 @@ This function is implemented in terms of the Bessel functions using the relation
    namespace boost { namespace math {

    template <class T>
-   ``__sf_result`` airy_ai_prime(T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x);

    template <class T, class Policy>
-   ``__sf_result`` airy_ai_prime(T x, const Policy&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x, const Policy&);

    }} // namespaces
@@ -186,10 +186,10 @@ This function is implemented in terms of the Bessel functions using the relation
    namespace boost { namespace math {

    template <class T>
-   ``__sf_result`` airy_bi_prime(T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x);

    template <class T, class Policy>
-   ``__sf_result`` airy_bi_prime(T x, const Policy&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x, const Policy&);

    }} // namespaces
@@ -242,23 +242,23 @@ by providing an output iterator.
 The signature of the single value functions are:

    template <class T>
-   T airy_ai_zero(
+   BOOST_MATH_GPU_ENABLED T airy_ai_zero(
             int m);                    // 1-based index of zero.

    template <class T>
-   T airy_bi_zero(
+   BOOST_MATH_GPU_ENABLED T airy_bi_zero(
             int m);                    // 1-based index of zero.

 and for multiple zeros:

    template <class T, class OutputIterator>
-   OutputIterator airy_ai_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
                      int start_index,           // 1-based index of first zero.
                      unsigned number_of_zeros,  // How many zeros to generate.
                      OutputIterator out_it);    // Destination for zeros.

    template <class T, class OutputIterator>
-   OutputIterator airy_bi_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
                      int start_index,           // 1-based index of zero.
                      unsigned number_of_zeros,  // How many zeros to generate
                      OutputIterator out_it);    // Destination for zeros.
@@ -266,25 +266,25 @@ and for multiple zeros:
 There are also versions which allow control of the __policy_section for error handling and precision.

    template <class T, class Policy>
-   T airy_ai_zero(
+   BOOST_MATH_GPU_ENABLED T airy_ai_zero(
             int m,                     // 1-based index of zero.
             const Policy&);            // Policy to use.

    template <class T, class Policy>
-   T airy_bi_zero(
+   BOOST_MATH_GPU_ENABLED T airy_bi_zero(
             int m,                     // 1-based index of zero.
             const Policy&);            // Policy to use.

    template <class T, class OutputIterator, class Policy>
-   OutputIterator airy_ai_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
                      int start_index,           // 1-based index of first zero.
                      unsigned number_of_zeros,  // How many zeros to generate.
                      OutputIterator out_it,     // Destination for zeros.
                      const Policy& pol);        // Policy to use.

    template <class T, class OutputIterator, class Policy>
-   OutputIterator airy_bi_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
                      int start_index,           // 1-based index of zero.
                      unsigned number_of_zeros,  // How many zeros to generate.
                      OutputIterator out_it,     // Destination for zeros.
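Since the whole Airy family above is now BOOST_MATH_GPU_ENABLED, these functions become callable from device code as well as host code. A minimal sketch of what that enables, assuming NVCC and the patched headers on the include path; the kernel itself is illustrative and not part of this patch:

    // Hypothetical device-side use of the newly GPU-enabled Airy functions.
    #include <boost/math/special_functions/airy.hpp>

    __global__ void airy_kernel(const double* in, double* out, int numElements)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < numElements)
        {
            // BOOST_MATH_GPU_ENABLED expands to __host__ __device__ under CUDA,
            // so the same call also compiles in ordinary host code.
            out[i] = boost::math::airy_ai(in[i]);
        }
    }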
diff --git a/doc/sf/expint.qbk b/doc/sf/expint.qbk
index 89554730d..f0abf090e 100644
--- a/doc/sf/expint.qbk
+++ b/doc/sf/expint.qbk
@@ -11,10 +11,10 @@
    namespace boost{ namespace math{

    template <class T>
-   ``__sf_result`` expint(unsigned n, T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);

    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);

    }} // namespaces
@@ -26,10 +26,10 @@ the return type is `double` if T is an integer type, and T otherwise.

 [h4 Description]

    template <class T>
-   ``__sf_result`` expint(unsigned n, T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);

    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);

 Returns the [@http://mathworld.wolfram.com/En-Function.html exponential integral En] of z:
@@ -100,10 +100,10 @@ is used.
    namespace boost{ namespace math{

    template <class T>
-   ``__sf_result`` expint(T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);

    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);

    }} // namespaces
@@ -115,10 +115,10 @@ the return type is `double` if T is an integer type, and T otherwise.

 [h4 Description]

    template <class T>
-   ``__sf_result`` expint(T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);

    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);

 Returns the [@http://mathworld.wolfram.com/ExponentialIntegral.html exponential integral] of z:
diff --git a/doc/sf/gegenbauer.qbk b/doc/sf/gegenbauer.qbk
index a6afc53d8..69671917c 100644
--- a/doc/sf/gegenbauer.qbk
+++ b/doc/sf/gegenbauer.qbk
@@ -16,13 +16,13 @@
    namespace boost{ namespace math{

    template <typename Real>
-   Real gegenbauer(unsigned n, Real lambda, Real x);
+   BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x);

    template <typename Real>
-   Real gegenbauer_prime(unsigned n, Real lambda, Real x);
+   BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x);

    template <typename Real>
-   Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);
+   BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);

    }} // namespaces
diff --git a/doc/sf/hankel.qbk b/doc/sf/hankel.qbk
index 4d8a5eda1..05d65201b 100644
--- a/doc/sf/hankel.qbk
+++ b/doc/sf/hankel.qbk
@@ -3,18 +3,36 @@

 [h4 Synopsis]

+   #if !defined(__CUDACC__) && !defined(__CUDACC_RTC__)
+
    template <class T1, class T2>
-   std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);

    template <class T1, class T2, class ``__Policy``>
-   std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);

    template <class T1, class T2>
-   std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);

    template <class T1, class T2, class ``__Policy``>
-   std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
+
+   #else // When using cuda we use namespace cuda::std:: instead of std::
+
+   template <class T1, class T2>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
+
+   template <class T1, class T2, class ``__Policy``>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
+
+   template <class T1, class T2>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
+
+   template <class T1, class T2, class ``__Policy``>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
+
+   #endif
+

 [h4 Description]
@@ -77,18 +95,35 @@ routines for integer order are used.

 [h4 Synopsis]

+   #if !defined(__CUDACC__) && !defined(__CUDACC_RTC__)
+
    template <class T1, class T2>
-   std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x);

    template <class T1, class T2, class ``__Policy``>
-   std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&);

    template <class T1, class T2>
-   std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x);
+
+   template <class T1, class T2, class ``__Policy``>
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&);
+
+   #else // When using cuda we use namespace cuda::std:: instead of std::
+
+   template <class T1, class T2>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x);

    template <class T1, class T2, class ``__Policy``>
-   std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&);
+
+   template <class T1, class T2>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x);
+
+   template <class T1, class T2, class ``__Policy``>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&);
+
+   #endif

 [h4 Description]
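The conditional synopsis above changes only the spelling of the complex return type: under NVCC/NVRTC the Hankel functions return cuda::std::complex, everywhere else std::complex. Host code that wants to stay agnostic to that difference can simply deduce the type; a small illustrative sketch, not part of the patch:

    #include <boost/math/special_functions/hankel.hpp>
    #include <iostream>

    int main()
    {
        // std::complex<double> on a host compiler; the same expression would
        // yield cuda::std::complex<double> when compiled with CUDA.
        auto h1 = boost::math::cyl_hankel_1(0.5, 2.0);
        std::cout << h1.real() << " + " << h1.imag() << "i\n";
    }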
diff --git a/doc/sf/hermite.qbk b/doc/sf/hermite.qbk
index c88aadc34..965aa8092 100644
--- a/doc/sf/hermite.qbk
+++ b/doc/sf/hermite.qbk
@@ -9,13 +9,13 @@
    namespace boost{ namespace math{

    template <class T>
-   ``__sf_result`` hermite(unsigned n, T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);

    template <class T, class ``__Policy``>
-   ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);

    template <class T1, class T2, class T3>
-   ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);

    }} // namespaces
@@ -26,10 +26,10 @@ note than when there is a single template argument the result is the same type
 as that argument or `double` if the template argument is an integer type.

    template <class T>
-   ``__sf_result`` hermite(unsigned n, T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);

    template <class T, class ``__Policy``>
-   ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);

 Returns the value of the Hermite Polynomial of order /n/ at point /x/:
@@ -43,7 +43,7 @@ Hermite Polynomials:
 [graph hermite]

    template <class T1, class T2, class T3>
-   ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);

 Implements the three term recurrence relation for the Hermite
 polynomials, this function can be used to create a sequence of
diff --git a/include/boost/math/special_functions/bessel.hpp b/include/boost/math/special_functions/bessel.hpp
index 081473442..c32f251bc 100644
--- a/include/boost/math/special_functions/bessel.hpp
+++ b/include/boost/math/special_functions/bessel.hpp
@@ -47,50 +47,6 @@
 namespace boost{ namespace math{

-// Since we cannot pull this in from math fwd we need a copy
-#ifdef BOOST_MATH_HAS_NVRTC
-
-namespace detail{
-
-   typedef boost::math::integral_constant<int, 0> bessel_no_int_tag;      // No integer optimisation possible.
-   typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag;   // Maybe integer optimisation.
-   typedef boost::math::integral_constant<int, 2> bessel_int_tag;         // Definite integer optimisation.
-
-   template <typename T1, typename T2, typename Policy>
-   struct bessel_traits
-   {
-      using result_type = typename boost::math::conditional<
-         boost::math::is_integral<T1>::value,
-         typename tools::promote_args<T2>::type,
-         tools::promote_args_t<T1, T2>
-      >::type;
-
-      typedef typename policies::precision<result_type, Policy>::type precision_type;
-
-      using optimisation_tag = typename boost::math::conditional<
-         (precision_type::value <= 0 || precision_type::value > 64),
-         bessel_no_int_tag,
-         typename boost::math::conditional<
-            boost::math::is_integral<T1>::value,
-            bessel_int_tag,
-            bessel_maybe_int_tag
-         >::type
-      >::type;
-
-      using optimisation_tag128 = typename boost::math::conditional<
-         (precision_type::value <= 0 || precision_type::value > 113),
-         bessel_no_int_tag,
-         typename boost::math::conditional<
-            boost::math::is_integral<T1>::value,
-            bessel_int_tag,
-            bessel_maybe_int_tag
-         >::type
-      >::type;
-   };
-   } // detail
-
-#endif
-
 namespace detail{

 template
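hermite_next implements the three-term recurrence H_{n+1}(x) = 2x H_n(x) - 2n H_{n-1}(x), which is exactly the step the library's own hermite_imp iterates. A short host-side sketch of generating a sequence with it, illustrative and not from the patch:

    #include <boost/math/special_functions/hermite.hpp>
    #include <iostream>

    int main()
    {
        double x = 0.75;
        double p0 = 1;      // H_0(x)
        double p1 = 2 * x;  // H_1(x)
        for (unsigned n = 1; n < 6; ++n)
        {
            double next = boost::math::hermite_next(n, x, p1, p0); // H_{n+1}(x)
            p0 = p1;
            p1 = next;
        }
        std::cout << "H_6(0.75) = " << p1 << '\n'; // 144.515625
    }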
diff --git a/include/boost/math/special_functions/expint.hpp b/include/boost/math/special_functions/expint.hpp
index 1475a9a88..09e97bd4f 100644
--- a/include/boost/math/special_functions/expint.hpp
+++ b/include/boost/math/special_functions/expint.hpp
@@ -1,4 +1,5 @@
 // Copyright John Maddock 2007.
+// Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -12,6 +13,10 @@
 #pragma warning(disable:4702) // Unreachable code (release mode only warning)
 #endif

+#include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -20,7 +25,6 @@
 #include
 #include
 #include
-#include

 #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
 //
@@ -35,13 +39,13 @@ namespace boost{ namespace math{

 template <class T, class Policy>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    expint(unsigned n, T z, const Policy& /*pol*/);

 namespace detail{

 template <class T>
-inline T expint_1_rational(const T& z, const std::integral_constant<int, 0>&)
+BOOST_MATH_GPU_ENABLED inline T expint_1_rational(const T& z, const boost::math::integral_constant<int, 0>&)
 {
    // this function is never actually called
    BOOST_MATH_ASSERT(0);
@@ -49,7 +53,7 @@ inline T expint_1_rational(const T& z, const std::integral_constant<int, 0>&)
 }

 template <class T>
-T expint_1_rational(const T& z, const std::integral_constant<int, 53>&)
+BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant<int, 53>&)
 {
    BOOST_MATH_STD_USING
    T result;
@@ -123,7 +127,7 @@ T expint_1_rational(const T& z, const std::integral_constant<int, 53>&)
 }

 template <class T>
-T expint_1_rational(const T& z, const std::integral_constant<int, 64>&)
+BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant<int, 64>&)
 {
    BOOST_MATH_STD_USING
    T result;
@@ -204,7 +208,7 @@ T expint_1_rational(const T& z, const std::integral_constant<int, 64>&)
 }

 template <class T>
-T expint_1_rational(const T& z, const std::integral_constant<int, 113>&)
+BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant<int, 113>&)
 {
    BOOST_MATH_STD_USING
    T result;
@@ -351,14 +355,15 @@ T expint_1_rational(const T& z, const std::integral_constant<int, 113>&)
    return result;
 }

+
 template <class T>
 struct expint_fraction
 {
-   typedef std::pair<T, T> result_type;
-   expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){}
-   std::pair<T, T> operator()()
+   typedef boost::math::pair<T, T> result_type;
+   BOOST_MATH_GPU_ENABLED expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){}
+   BOOST_MATH_GPU_ENABLED boost::math::pair<T, T> operator()()
    {
-      std::pair<T, T> result = std::make_pair(-static_cast<T>((i+1) * (n+i)), b);
+      boost::math::pair<T, T> result = boost::math::make_pair(-static_cast<T>((i+1) * (n+i)), b);
       b += 2;
       ++i;
       return result;
@@ -370,11 +375,11 @@ struct expint_fraction
 };

 template <class T, class Policy>
-inline T expint_as_fraction(unsigned n, T z, const Policy& pol)
+BOOST_MATH_GPU_ENABLED inline T expint_as_fraction(unsigned n, T z, const Policy& pol)
 {
    BOOST_MATH_STD_USING
    BOOST_MATH_INSTRUMENT_VARIABLE(z)
-   std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
+   boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
    expint_fraction<T> f(n, z);
    T result = tools::continued_fraction_b(
       f,
@@ -392,9 +397,9 @@ template <class T>
 struct expint_series
 {
    typedef T result_type;
-   expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_)
+   BOOST_MATH_GPU_ENABLED expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_)
       : k(k_), z(z_), x_k(x_k_), denom(denom_), fact(fact_){}
-   T operator()()
+   BOOST_MATH_GPU_ENABLED T operator()()
    {
       x_k *= -z;
       denom += 1;
@@ -410,10 +415,10 @@ struct expint_series
 };

 template <class T, class Policy>
-inline T expint_as_series(unsigned n, T z, const Policy& pol)
+BOOST_MATH_GPU_ENABLED inline T expint_as_series(unsigned n, T z, const Policy& pol)
 {
    BOOST_MATH_STD_USING
-   std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
+   boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();

    BOOST_MATH_INSTRUMENT_VARIABLE(z)
@@ -443,10 +448,10 @@ inline T expint_as_series(unsigned n, T z, const Policy& pol)
 }

 template <class T, class Policy, class Tag>
-T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag)
+BOOST_MATH_GPU_ENABLED T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag)
 {
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::expint<%1%>(unsigned, %1%)";
+   constexpr auto function = "boost::math::expint<%1%>(unsigned, %1%)";
    if(z < 0)
       return policies::raise_domain_error<T>(function, "Function requires z >= 0 but got %1%.", z, pol);
    if(z == 0)
@@ -468,15 +473,21 @@ T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag)
 #  pragma warning(disable:4127) // conditional expression is constant
 #endif
    if(n == 0)
+   {
       result = exp(-z) / z;
+   }
    else if((n == 1) && (Tag::value))
    {
       result = expint_1_rational(z, tag);
    }
    else if(f)
+   {
       result = expint_as_series(n, z, pol);
+   }
    else
+   {
       result = expint_as_fraction(n, z, pol);
+   }
 #ifdef _MSC_VER
 #  pragma warning(pop)
 #endif
@@ -488,8 +499,8 @@ template <class T>
 struct expint_i_series
 {
    typedef T result_type;
-   expint_i_series(T z_) : k(0), z_k(1), z(z_){}
-   T operator()()
+   BOOST_MATH_GPU_ENABLED expint_i_series(T z_) : k(0), z_k(1), z(z_){}
+   BOOST_MATH_GPU_ENABLED T operator()()
    {
       z_k *= z / ++k;
       return z_k / k;
@@ -501,22 +512,22 @@ struct expint_i_series
 };

 template <class T, class Policy>
-T expint_i_as_series(T z, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T expint_i_as_series(T z, const Policy& pol)
 {
    BOOST_MATH_STD_USING
    T result = log(z); // (log(z) - log(1 / z)) / 2;
    result += constants::euler<T>();
    expint_i_series<T> s(z);
-   std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
+   boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
    result = tools::sum_series(s, policies::get_epsilon<T, Policy>(), max_iter, result);
    policies::check_series_iterations<T>("boost::math::expint_i_series<%1%>(%1%)", max_iter, pol);
    return result;
 }

 template <class T, class Policy, class Tag>
-T expint_i_imp(T z, const Policy& pol, const Tag& tag)
+BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const Tag& tag)
 {
-   static const char* function = "boost::math::expint<%1%>(%1%)";
+   constexpr auto function = "boost::math::expint<%1%>(%1%)";
    if(z < 0)
       return -expint_imp(1, T(-z), pol, tag);
    if(z == 0)
@@ -525,10 +536,10 @@ T expint_i_imp(T z, const Policy& pol, const Tag& tag)
 }

 template <class T, class Policy>
-T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& tag)
+BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant<int, 53>& tag)
 {
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::expint<%1%>(%1%)";
+   constexpr auto function = "boost::math::expint<%1%>(%1%)";
    if(z < 0)
       return -expint_imp(1, T(-z), pol, tag);
    if(z == 0)
@@ -541,7 +552,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
       // Maximum Deviation Found:                     2.852e-18
       // Expected Error Term:                         2.852e-18
       // Max Error found at double precision =        Poly: 2.636335e-16   Cheb: 4.187027e-16
-      static const T P[10] = {
+      BOOST_MATH_STATIC const T P[10] = {
          BOOST_MATH_BIG_CONSTANT(T, 53, 2.98677224343598593013),
         BOOST_MATH_BIG_CONSTANT(T, 53, 0.356343618769377415068),
         BOOST_MATH_BIG_CONSTANT(T, 53, 0.780836076283730801839),
@@ -553,7 +564,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
         BOOST_MATH_BIG_CONSTANT(T, 53, 0.798296365679269702435e-5),
         BOOST_MATH_BIG_CONSTANT(T, 53, 0.2777056254402008721e-6)
      };
-      static const T Q[8] = {
+      BOOST_MATH_STATIC const T Q[8] = {
         BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
         BOOST_MATH_BIG_CONSTANT(T, 53, -1.17090412365413911947),
         BOOST_MATH_BIG_CONSTANT(T, 53, 0.62215109846016746276),
@@ -564,11 +575,11 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.138972589601781706598e-4)
      };

-      static const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0);
-      static const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0);
-      static const T r1 = static_cast<T>(c1 / c2);
-      static const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16);
-      static const T r = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392));
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0);
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0);
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T r1 = static_cast<T>(c1 / c2);
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16);
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T r = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392));
      T t = (z / 3) - 1;
      result = tools::evaluate_polynomial(P, t)
         / tools::evaluate_polynomial(Q, t);
@@ -588,8 +599,8 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
      // Maximum Deviation Found:                     6.546e-17
      // Expected Error Term:                         6.546e-17
      // Max Error found at double precision =        Poly: 6.890169e-17   Cheb: 6.772128e-17
-      static const T Y = 1.158985137939453125F;
-      static const T P[8] = {
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.158985137939453125F;
+      BOOST_MATH_STATIC const T P[8] = {
         BOOST_MATH_BIG_CONSTANT(T, 53, 0.00139324086199402804173),
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.0349921221823888744966),
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.0264095520754134848538),
@@ -599,7 +610,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.554086272024881826253e-4),
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.396487648924804510056e-5)
      };
-      static const T Q[8] = {
+      BOOST_MATH_STATIC const T Q[8] = {
         BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
         BOOST_MATH_BIG_CONSTANT(T, 53, 0.744625566823272107711),
         BOOST_MATH_BIG_CONSTANT(T, 53, 0.329061095011767059236),
@@ -621,8 +632,8 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
      // Expected Error Term:                         -1.842e-17
      // Max Error found at double precision =        Poly: 4.375868e-17   Cheb: 5.860967e-17

-      static const T Y = 1.0869731903076171875F;
-      static const T P[9] = {
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.0869731903076171875F;
+      BOOST_MATH_STATIC const T P[9] = {
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.00893891094356945667451),
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.0484607730127134045806),
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.0652810444222236895772),
@@ -633,7 +644,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.000209750022660200888349),
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.138652200349182596186e-4)
      };
-      static const T Q[9] = {
+      BOOST_MATH_STATIC const T Q[9] = {
         BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
         BOOST_MATH_BIG_CONSTANT(T, 53, 1.97017214039061194971),
         BOOST_MATH_BIG_CONSTANT(T, 53, 1.86232465043073157508),
@@ -657,8 +668,8 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
      // Max Error found at double precision =        Poly: 1.441088e-16   Cheb: 1.864792e-16

-      static const T Y = 1.03937530517578125F;
-      static const T P[9] = {
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.03937530517578125F;
+      BOOST_MATH_STATIC const T P[9] = {
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.00356165148914447597995),
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.0229930320357982333406),
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.0449814350482277917716),
@@ -669,7 +680,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.000192178045857733706044),
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.113161784705911400295e-9)
      };
-      static const T Q[9] = {
+      BOOST_MATH_STATIC const T Q[9] = {
         BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
         BOOST_MATH_BIG_CONSTANT(T, 53, 2.84354408840148561131),
         BOOST_MATH_BIG_CONSTANT(T, 53, 3.6599610090072393012),
@@ -688,9 +699,9 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
   else
   {
      // Max Error found at double precision = 3.381886e-17
-      static const T exp40 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17));
-      static const T Y= 1.013065338134765625F;
-      static const T P[6] = {
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T exp40 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17));
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y= 1.013065338134765625F;
+      BOOST_MATH_STATIC const T P[6] = {
         BOOST_MATH_BIG_CONSTANT(T, 53, -0.0130653381347656243849),
         BOOST_MATH_BIG_CONSTANT(T, 53, 0.19029710559486576682),
         BOOST_MATH_BIG_CONSTANT(T, 53, 94.7365094537197236011),
@@ -698,7 +709,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
         BOOST_MATH_BIG_CONSTANT(T, 53, 18932.0850014925993025),
         BOOST_MATH_BIG_CONSTANT(T, 53, -38703.1431362056714134)
      };
-      static const T Q[7] = {
+      BOOST_MATH_STATIC const T Q[7] = {
         BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
         BOOST_MATH_BIG_CONSTANT(T, 53, 61.9733592849439884145),
         BOOST_MATH_BIG_CONSTANT(T, 53, -2354.56211323420194283),
@@ -739,10 +750,10 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
 }

 template <class T, class Policy>
-T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 64>& tag)
+BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant<int, 64>& tag)
 {
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::expint<%1%>(%1%)";
+   constexpr auto function = "boost::math::expint<%1%>(%1%)";
    if(z < 0)
       return -expint_imp(1, T(-z), pol, tag);
    if(z == 0)
@@ -976,7 +987,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 64>& ta
 }

 template <class T, class Policy>
-void expint_i_imp_113a(T& result, const T& z, const Policy& pol)
+BOOST_MATH_GPU_ENABLED void expint_i_imp_113a(T& result, const T& z, const Policy& pol)
 {
    BOOST_MATH_STD_USING
    // Maximum Deviation Found:                     1.230e-36
@@ -1044,7 +1055,7 @@ void expint_i_imp_113a(T& result, const T& z, const Policy& pol)
 }

 template <class T>
-void expint_i_113b(T& result, const T& z)
+BOOST_MATH_GPU_ENABLED void expint_i_113b(T& result, const T& z)
 {
    BOOST_MATH_STD_USING
    // Maximum Deviation Found:                     7.779e-36
@@ -1094,7 +1105,7 @@ void expint_i_113b(T& result, const T& z)
 }

 template <class T>
-void expint_i_113c(T& result, const T& z)
+BOOST_MATH_GPU_ENABLED void expint_i_113c(T& result, const T& z)
 {
    BOOST_MATH_STD_USING
    // Maximum Deviation Found:                     1.082e-34
@@ -1147,7 +1158,7 @@ void expint_i_113c(T& result, const T& z)
 }

 template <class T>
-void expint_i_113d(T& result, const T& z)
+BOOST_MATH_GPU_ENABLED void expint_i_113d(T& result, const T& z)
 {
    BOOST_MATH_STD_USING
    // Maximum Deviation Found:                     3.163e-35
@@ -1198,7 +1209,7 @@ void expint_i_113d(T& result, const T& z)
 }

 template <class T>
-void expint_i_113e(T& result, const T& z)
+BOOST_MATH_GPU_ENABLED void expint_i_113e(T& result, const T& z)
 {
    BOOST_MATH_STD_USING
    // Maximum Deviation Found:                     7.972e-36
@@ -1252,7 +1263,7 @@ void expint_i_113e(T& result, const T& z)
 }

 template <class T>
-void expint_i_113f(T& result, const T& z)
+BOOST_MATH_GPU_ENABLED void expint_i_113f(T& result, const T& z)
 {
    BOOST_MATH_STD_USING
    // Maximum Deviation Found:                     4.469e-36
@@ -1299,7 +1310,7 @@ void expint_i_113f(T& result, const T& z)
 }

 template <class T>
-void expint_i_113g(T& result, const T& z)
+BOOST_MATH_GPU_ENABLED void expint_i_113g(T& result, const T& z)
 {
    BOOST_MATH_STD_USING
    // Maximum Deviation Found:                     5.588e-35
@@ -1344,7 +1355,7 @@ void expint_i_113g(T& result, const T& z)
 }

 template <class T>
-void expint_i_113h(T& result, const T& z)
+BOOST_MATH_GPU_ENABLED void expint_i_113h(T& result, const T& z)
 {
    BOOST_MATH_STD_USING
    // Maximum Deviation Found:                     4.448e-36
@@ -1383,10 +1394,10 @@ void expint_i_113h(T& result, const T& z)
 }

 template <class T, class Policy>
-T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 113>& tag)
+BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant<int, 113>& tag)
 {
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::expint<%1%>(%1%)";
+   constexpr auto function = "boost::math::expint<%1%>(%1%)";
    if(z < 0)
       return -expint_imp(1, T(-z), pol, tag);
    if(z == 0)
@@ -1491,12 +1502,12 @@ struct expint_i_initializer
 {
    struct init
    {
-      init()
+      BOOST_MATH_GPU_ENABLED init()
       {
          do_init(tag());
       }
-      static void do_init(const std::integral_constant<int, 0>&){}
-      static void do_init(const std::integral_constant<int, 53>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 0>&){}
+      BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 53>&)
       {
          boost::math::expint(T(5), Policy());
         boost::math::expint(T(7), Policy());
@@ -1504,7 +1515,7 @@ struct expint_i_initializer
         boost::math::expint(T(38), Policy());
         boost::math::expint(T(45), Policy());
      }
-      static void do_init(const std::integral_constant<int, 64>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
      {
         boost::math::expint(T(5), Policy());
         boost::math::expint(T(7), Policy());
@@ -1512,7 +1523,7 @@ struct expint_i_initializer
         boost::math::expint(T(38), Policy());
         boost::math::expint(T(45), Policy());
      }
-      static void do_init(const std::integral_constant<int, 113>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
      {
         boost::math::expint(T(5), Policy());
         boost::math::expint(T(7), Policy());
@@ -1524,12 +1535,14 @@ struct expint_i_initializer
         boost::math::expint(T(200), Policy());
         boost::math::expint(T(220), Policy());
      }
-      void force_instantiate()const{}
+      BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
   };
   static const init initializer;
-   static void force_instantiate()
+   BOOST_MATH_GPU_ENABLED static void force_instantiate()
   {
+      #ifndef BOOST_MATH_HAS_GPU_SUPPORT
      initializer.force_instantiate();
+      #endif
   }
 };
@@ -1541,33 +1554,35 @@ struct expint_1_initializer
 {
    struct init
    {
-      init()
+      BOOST_MATH_GPU_ENABLED init()
       {
          do_init(tag());
       }
-      static void do_init(const std::integral_constant<int, 0>&){}
-      static void do_init(const std::integral_constant<int, 53>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 0>&){}
+      BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 53>&)
       {
          boost::math::expint(1, T(0.5), Policy());
          boost::math::expint(1, T(2), Policy());
       }
-      static void do_init(const std::integral_constant<int, 64>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
       {
          boost::math::expint(1, T(0.5), Policy());
          boost::math::expint(1, T(2), Policy());
       }
-      static void do_init(const std::integral_constant<int, 113>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
       {
          boost::math::expint(1, T(0.5), Policy());
          boost::math::expint(1, T(2), Policy());
          boost::math::expint(1, T(6), Policy());
       }
-      void force_instantiate()const{}
+      BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
    };
    static const init initializer;
-   static void force_instantiate()
+   BOOST_MATH_GPU_ENABLED static void force_instantiate()
    {
+      #ifndef BOOST_MATH_HAS_GPU_SUPPORT
       initializer.force_instantiate();
+      #endif
    }
 };
@@ -1575,8 +1590,8 @@
 template <class T, class Policy>
 const typename expint_1_initializer<T, Policy, tag>::init expint_1_initializer<T, Policy, tag>::initializer;

 template <class T, class Policy>
-inline typename tools::promote_args<T>::type
-   expint_forwarder(T z, const Policy& /*pol*/, std::true_type const&)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
+   expint_forwarder(T z, const Policy& /*pol*/, boost::math::true_type const&)
 {
    typedef typename tools::promote_args<T>::type result_type;
    typedef typename policies::evaluation<result_type, Policy>::type value_type;
@@ -1587,7 +1602,7 @@ inline typename tools::promote_args<T>::type
       policies::promote_double<false>,
       policies::discrete_quantile<>,
       policies::assert_undefined<> >::type forwarding_policy;
-   typedef std::integral_constant
+   typedef boost::math::integral_constant
 ::type
 }

 template <class T>
-inline typename tools::promote_args<T>::type
-expint_forwarder(unsigned n, T z, const std::false_type&)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
+expint_forwarder(unsigned n, T z, const boost::math::false_type&)
 {
    return boost::math::expint(n, z, policies::policy<>());
 }
@@ -1612,7 +1627,7 @@ expint_forwarder(unsigned n, T z, const std::false_type&)
 } // namespace detail

 template <class T, class Policy>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    expint(unsigned n, T z, const Policy& /*pol*/)
 {
    typedef typename tools::promote_args<T>::type result_type;
@@ -1624,7 +1639,7 @@ inline typename tools::promote_args<T>::type
       policies::promote_double<false>,
       policies::discrete_quantile<>,
       policies::assert_undefined<> >::type forwarding_policy;
-   typedef std::integral_constant
+   typedef boost::math::integral_constant
 ::type
 }

 template <class T, class U>
-inline typename detail::expint_result<T, U>::type
+BOOST_MATH_GPU_ENABLED inline typename detail::expint_result<T, U>::type
    expint(T const z, U const u)
 {
    typedef typename policies::is_policy<U>::type tag_type;
@@ -1649,7 +1664,7 @@ inline typename detail::expint_result<T, U>::type
 }

 template <class T>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    expint(T z)
 {
    return expint(z, policies::policy<>());
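With the changes above, both expint overloads run entirely through GPU-safe machinery (boost::math::integral_constant dispatch, boost::math::pair, BOOST_MATH_STATIC constants), so they can be instantiated in device code. A minimal illustrative kernel, assuming NVCC and the patched headers; it is not part of the patch:

    #include <boost/math/special_functions/expint.hpp>

    __global__ void expint_kernel(const double* z, double* en, double* ei, int numElements)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < numElements)
        {
            en[i] = boost::math::expint(2U, z[i]); // E_2(z), requires z >= 0
            ei[i] = boost::math::expint(z[i]);     // Ei(z)
        }
    }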
diff --git a/include/boost/math/special_functions/gegenbauer.hpp b/include/boost/math/special_functions/gegenbauer.hpp
index b7033cd14..70324cf65 100644
--- a/include/boost/math/special_functions/gegenbauer.hpp
+++ b/include/boost/math/special_functions/gegenbauer.hpp
@@ -1,4 +1,5 @@
 // (C) Copyright Nick Thompson 2019.
+// (C) Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -6,21 +7,25 @@
 #ifndef BOOST_MATH_SPECIAL_GEGENBAUER_HPP
 #define BOOST_MATH_SPECIAL_GEGENBAUER_HPP

-#include
+#include
+#include
+#include
+
+#ifndef BOOST_MATH_NO_EXCEPTIONS
 #include
-#include
+#endif

 namespace boost { namespace math {

 template <typename Real>
-Real gegenbauer(unsigned n, Real lambda, Real x)
+BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x)
 {
-    static_assert(!std::is_integral<Real>::value, "Gegenbauer polynomials required floating point arguments.");
+    static_assert(!boost::math::is_integral<Real>::value, "Gegenbauer polynomials required floating point arguments.");
     if (lambda <= -1/Real(2)) {
 #ifndef BOOST_MATH_NO_EXCEPTIONS
        throw std::domain_error("lambda > -1/2 is required.");
 #else
-       return std::numeric_limits<Real>::quiet_NaN();
+       return boost::math::numeric_limits<Real>::quiet_NaN();
 #endif
     }
     // The only reason to do this is because of some instability that could be present for x < 0 that is not present for x > 0.
@@ -41,7 +46,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x)
     Real yk = y1;
     Real k = 2;
-    Real k_max = n*(1+std::numeric_limits<Real>::epsilon());
+    Real k_max = n*(1+boost::math::numeric_limits<Real>::epsilon());
     Real gamma = 2*(lambda - 1);
     while(k < k_max)
     {
@@ -55,7 +60,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x)

 template <typename Real>
-Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k)
+BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k)
 {
     if (k > n) {
         return Real(0);
@@ -70,7 +75,7 @@ Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k)
 }

 template <typename Real>
-Real gegenbauer_prime(unsigned n, Real lambda, Real x) {
+BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x) {
     return gegenbauer_derivative(n, lambda, x, 1);
 }
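gegenbauer() climbs the ascending recurrence visible above (y0 = 1, y1 = 2*lambda*x, then steps up to n). One easy correctness probe: for lambda = 1/2 the Gegenbauer polynomials reduce to the Legendre polynomials. An illustrative host-side check, not part of the patch:

    #include <boost/math/special_functions/gegenbauer.hpp>
    #include <iostream>

    int main()
    {
        // C_3^{1/2}(x) equals the Legendre polynomial P_3(x) = (5x^3 - 3x)/2.
        double x = 0.5;
        double c = boost::math::gegenbauer(3u, 0.5, x);
        std::cout << c << " vs " << (5*x*x*x - 3*x)/2 << '\n'; // both -0.4375
    }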
diff --git a/include/boost/math/special_functions/hankel.hpp b/include/boost/math/special_functions/hankel.hpp
index 51b8390d9..730c7afa0 100644
--- a/include/boost/math/special_functions/hankel.hpp
+++ b/include/boost/math/special_functions/hankel.hpp
@@ -1,4 +1,5 @@
 // Copyright John Maddock 2012.
+// Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0.
 // (See accompanying file LICENSE_1_0.txt
@@ -7,26 +8,31 @@
 #ifndef BOOST_MATH_HANKEL_HPP
 #define BOOST_MATH_HANKEL_HPP

+#include
+#include
 #include
 #include
+#include
+#include
+#include

 namespace boost{ namespace math{

 namespace detail{

 template <class T, class Policy>
-std::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign)
+BOOST_MATH_GPU_ENABLED boost::math::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign)
 {
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)";
+   constexpr auto function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)";

    if(x < 0)
    {
       bool isint_v = floor(v) == v;
       T j, y;
       bessel_jy(v, -x, &j, &y, need_j | need_y, pol);
-      std::complex<T> cx(x), cv(v);
-      std::complex<T> j_result, y_result;
+      boost::math::complex<T> cx(x), cv(v);
+      boost::math::complex<T> j_result, y_result;
       if(isint_v)
       {
          int s = (iround(v) & 1) ? -1 : 1;
@@ -37,12 +43,12 @@ std::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol
      {
         j_result = pow(cx, v) * pow(-cx, -v) * j;
         T p1 = pow(-x, v);
-         std::complex<T> p2 = pow(cx, v);
+         boost::math::complex<T> p2 = pow(cx, v);
         y_result = p1 * y / p2 + (p2 / p1 - p1 / p2) * j / tan(constants::pi<T>() * v);
      }
      // multiply y_result by i:
-      y_result = std::complex<T>(-sign * y_result.imag(), sign * y_result.real());
+      y_result = boost::math::complex<T>(-sign * y_result.imag(), sign * y_result.real());
      return j_result + y_result;
    }
@@ -51,25 +57,25 @@ std::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol
      if(v == 0)
      {
         // J is 1, Y is -INF
-         return std::complex<T>(1, sign * -policies::raise_overflow_error<T>(function, nullptr, pol));
+         return boost::math::complex<T>(1, sign * -policies::raise_overflow_error<T>(function, nullptr, pol));
      }
      else
      {
         // At least one of J and Y is complex infinity:
-         return std::complex<T>(policies::raise_overflow_error<T>(function, nullptr, pol), sign * policies::raise_overflow_error<T>(function, nullptr, pol));
+         return boost::math::complex<T>(policies::raise_overflow_error<T>(function, nullptr, pol), sign * policies::raise_overflow_error<T>(function, nullptr, pol));
      }
   }

   T j, y;
   bessel_jy(v, x, &j, &y, need_j | need_y, pol);
-   return std::complex<T>(j, sign * y);
+   return boost::math::complex<T>(j, sign * y);
 }

 template <class T, class Policy>
-std::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign);
+BOOST_MATH_GPU_ENABLED boost::math::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign);

 template <class T, class Policy>
-inline std::complex<T> hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<T> hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign)
 {
    BOOST_MATH_STD_USING  // ADL of std names.
    int ival = detail::iconv(v, pol);
@@ -81,57 +87,57 @@ inline std::complex<T> hankel_imp(T v, T x, const bessel_maybe_int_tag&, const P
 }

 template <class T, class Policy>
-inline std::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign)
 {
    BOOST_MATH_STD_USING
-   if((std::abs(v) < 200) && (x > 0))
-      return std::complex<T>(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol));
+   if((abs(v) < 200) && (x > 0))
+      return boost::math::complex<T>(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol));
    return hankel_imp(static_cast<T>(v), x, bessel_no_int_tag(), pol, sign);
 }

 template <class T, class Policy>
-inline std::complex<T> sph_hankel_imp(T v, T x, const Policy& pol, int sign)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<T> sph_hankel_imp(T v, T x, const Policy& pol, int sign)
 {
    BOOST_MATH_STD_USING
-   return constants::root_half_pi<T>() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(std::complex<T>(x));
+   return constants::root_half_pi<T>() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(boost::math::complex<T>(x));
 }

 } // namespace detail

 template <class T1, class T2, class Policy>
-inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
    typedef typename detail::bessel_traits<T1, T2, Policy>::optimisation_tag tag_type;
    typedef typename policies::evaluation<result_type, Policy>::type value_type;
-   return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::hankel_imp(v, static_cast<value_type>(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
+   return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::hankel_imp(v, static_cast<value_type>(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
 }

 template <class T1, class T2>
-inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x)
 {
    return cyl_hankel_1(v, x, policies::policy<>());
 }

 template <class T1, class T2, class Policy>
-inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
    typedef typename detail::bessel_traits<T1, T2, Policy>::optimisation_tag tag_type;
    typedef typename policies::evaluation<result_type, Policy>::type value_type;
-   return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::hankel_imp(v, static_cast<value_type>(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
+   return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::hankel_imp(v, static_cast<value_type>(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
 }

 template <class T1, class T2>
-inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x)
 {
    return cyl_hankel_2(v, x, policies::policy<>());
 }

 template <class T1, class T2, class Policy>
-inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy&)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -143,17 +149,17 @@ inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type>
       policies::discrete_quantile<>,
       policies::assert_undefined<> >::type forwarding_policy;

-   return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::sph_hankel_imp(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
+   return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::sph_hankel_imp(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
 }

 template <class T1, class T2>
-inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x)
 {
    return sph_hankel_1(v, x, policies::policy<>());
 }

 template <class T1, class T2, class Policy>
-inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy&)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -165,11 +171,11 @@ inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type>
       policies::discrete_quantile<>,
       policies::assert_undefined<> >::type forwarding_policy;

-   return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::sph_hankel_imp(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
+   return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::sph_hankel_imp(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
 }

 template <class T1, class T2>
-inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x)
+BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x)
 {
    return sph_hankel_2(v, x, policies::policy<>());
 }
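The sign argument threaded through hankel_imp encodes the defining relations H1_v(x) = J_v(x) + i Y_v(x) and H2_v(x) = J_v(x) - i Y_v(x): cyl_hankel_1 passes +1 and cyl_hankel_2 passes -1. A quick host-side sanity check of that identity, illustrative only:

    #include <boost/math/special_functions/hankel.hpp>
    #include <boost/math/special_functions/bessel.hpp>
    #include <cassert>
    #include <cmath>

    int main()
    {
        double v = 1.5, x = 3.25;
        auto h1 = boost::math::cyl_hankel_1(v, x);
        double j = boost::math::cyl_bessel_j(v, x); // real part of H1
        double y = boost::math::cyl_neumann(v, x);  // imaginary part of H1
        assert(std::abs(h1.real() - j) < 1e-12);
        assert(std::abs(h1.imag() - y) < 1e-12);
    }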
diff --git a/include/boost/math/special_functions/hermite.hpp b/include/boost/math/special_functions/hermite.hpp
index 81ccb2ac6..3d77fc03e 100644
--- a/include/boost/math/special_functions/hermite.hpp
+++ b/include/boost/math/special_functions/hermite.hpp
@@ -1,5 +1,6 @@

 // (C) Copyright John Maddock 2006.
+// (C) Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -11,8 +12,9 @@
 #pragma once
 #endif

-#include
 #include
+#include
+#include
 #include

 namespace boost{
@@ -20,7 +22,7 @@ namespace math{

 // Recurrence relation for Hermite polynomials:
 template <class T1, class T2, class T3>
-inline typename tools::promote_args<T1, T2, T3>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
    hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1)
 {
    using promoted_type = tools::promote_args_t<T1, T2, T3>;
@@ -31,7 +33,7 @@ namespace detail{

 // Implement Hermite polynomials via recurrence:
 template <class T>
-T hermite_imp(unsigned n, T x)
+BOOST_MATH_GPU_ENABLED T hermite_imp(unsigned n, T x)
 {
    T p0 = 1;
    T p1 = 2 * x;
@@ -43,7 +45,7 @@ T hermite_imp(unsigned n, T x)

    while(c < n)
    {
-      std::swap(p0, p1);
+      BOOST_MATH_GPU_SAFE_SWAP(p0, p1);
       p1 = static_cast<T>(hermite_next(c, x, p0, p1));
       ++c;
    }
@@ -53,7 +55,7 @@ T hermite_imp(unsigned n, T x)
 } // namespace detail

 template <class T, class Policy>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    hermite(unsigned n, T x, const Policy&)
 {
    typedef typename tools::promote_args<T>::type result_type;
@@ -62,7 +64,7 @@ inline typename tools::promote_args<T>::type
 }

 template <class T>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    hermite(unsigned n, T x)
 {
    return boost::math::hermite(n, x, policies::policy<>());
diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp
index 35f8dd1bb..91dde74cc 100644
--- a/include/boost/math/special_functions/math_fwd.hpp
+++ b/include/boost/math/special_functions/math_fwd.hpp
@@ -27,6 +27,7 @@
 #include
 #include  // for argument promotion.
 #include
+#include
 #include

 #ifdef BOOST_MATH_HAS_NVRTC
@@ -50,7 +51,53 @@ namespace detail{
    >::type;
 };

-} // namespace detail
+template <typename T, typename U>
+struct expint_result
+{
+   using type = typename boost::math::conditional<
+      policies::is_policy<U>::value,
+      tools::promote_args_t<T>,
+      typename tools::promote_args<T, U>::type
+   >::type;
+};
+
+typedef boost::math::integral_constant<int, 0> bessel_no_int_tag;      // No integer optimisation possible.
+typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag;   // Maybe integer optimisation.
+typedef boost::math::integral_constant<int, 2> bessel_int_tag;         // Definite integer optimisation.
+
+template <typename T1, typename T2, typename Policy>
+struct bessel_traits
+{
+   using result_type = typename boost::math::conditional<
+      boost::math::is_integral<T1>::value,
+      typename tools::promote_args<T2>::type,
+      tools::promote_args_t<T1, T2>
+   >::type;
+
+   typedef typename policies::precision<result_type, Policy>::type precision_type;
+
+   using optimisation_tag = typename boost::math::conditional<
+      (precision_type::value <= 0 || precision_type::value > 64),
+      bessel_no_int_tag,
+      typename boost::math::conditional<
+         boost::math::is_integral<T1>::value,
+         bessel_int_tag,
+         bessel_maybe_int_tag
+      >::type
+   >::type;
+
+   using optimisation_tag128 = typename boost::math::conditional<
+      (precision_type::value <= 0 || precision_type::value > 113),
+      bessel_no_int_tag,
+      typename boost::math::conditional<
+         boost::math::is_integral<T1>::value,
+         bessel_int_tag,
+         bessel_maybe_int_tag
+      >::type
+   >::type;
+};
+
+} // namespace detail
 } // namespace math
 } // namespace boost
@@ -284,15 +331,15 @@ namespace boost
          laguerre(unsigned n, T1 m, T2 x);

    template <class T>
-   tools::promote_args_t<T>
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T>
          hermite(unsigned n, T x);

    template <class T, class Policy>
-   tools::promote_args_t<T>
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T>
          hermite(unsigned n, T x, const Policy& pol);

    template <class T1, class T2, class T3>
-   tools::promote_args_t<T1, T2, T3>
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2, T3>
          hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);

    template
@@ -808,28 +855,28 @@ namespace boost
          const Policy&);

    template <class T1, class T2>
-   std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x);

    template <class T1, class T2, class Policy>
-   std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol);

    template <class T1, class T2, class Policy>
-   std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol);

    template <class T1, class T2>
-   std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x);

    template <class T1, class T2, class Policy>
-   std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol);

    template <class T1, class T2>
-   std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x);

    template <class T1, class T2, class Policy>
-   std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol);

    template <class T1, class T2>
-   std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x);

    template <class T, class Policy>
    BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_ai(T x, const Policy&);
@@ -936,7 +983,7 @@ namespace boost
    template <class T, class U>
    struct expint_result
    {
-      typedef typename std::conditional<
+      typedef typename boost::math::conditional<
         policies::is_policy<U>::value,
         tools::promote_args_t<T>,
         typename tools::promote_args<T, U>::type
@@ -946,13 +993,13 @@ namespace boost
   } // namespace detail

    template <class T, class Policy>
-   tools::promote_args_t<T> expint(unsigned n, T z, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> expint(unsigned n, T z, const Policy&);

    template <class T, class U>
-   typename detail::expint_result<T, U>::type expint(T const z, U const u);
+   BOOST_MATH_GPU_ENABLED typename detail::expint_result<T, U>::type expint(T const z, U const u);

    template <class T>
-   tools::promote_args_t<T> expint(T z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> expint(T z);

    // Zeta:
    template
@@ -1346,7 +1393,7 @@ namespace boost
       laguerre(unsigned n, T1 m, T2 x) { return ::boost::math::laguerre(n, m, x, Policy()); }\
 \
    template <class T>\
-   inline boost::math::tools::promote_args_t<T> \
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> \
       hermite(unsigned n, T x){ return ::boost::math::hermite(n, x, Policy()); }\
 \
    using boost::math::hermite_next;\
@@ -1620,11 +1667,11 @@ template <class Policy>\
    using boost::math::changesign;\
 \
    template <class T, class U>\
-   inline typename boost::math::tools::promote_args_t<T, U> expint(T const& z, U const& u)\
+   BOOST_MATH_GPU_ENABLED inline typename boost::math::tools::promote_args_t<T, U> expint(T const& z, U const& u)\
    { return boost::math::expint(z, u, Policy()); }\
 \
    template <class T>\
-   inline boost::math::tools::promote_args_t<T> expint(T z){ return boost::math::expint(z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> expint(T z){ return boost::math::expint(z, Policy()); }\
 \
    template <class T>\
    inline boost::math::tools::promote_args_t<T> zeta(T s){ return boost::math::zeta(s, Policy()); }\
@@ -1669,19 +1716,19 @@ template <class Policy>\
    inline boost::math::tools::promote_args_t<RT1, RT2> owens_t(RT1 a, RT2 z){ return boost::math::owens_t(a, z, Policy()); }\
 \
    template <class T1, class T2>\
-   inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x)\
+   inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x)\
    { return boost::math::cyl_hankel_1(v, x, Policy()); }\
 \
    template <class T1, class T2>\
-   inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x)\
+   inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x)\
    { return boost::math::cyl_hankel_2(v, x, Policy()); }\
 \
    template <class T1, class T2>\
-   inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x)\
+   inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x)\
    { return boost::math::sph_hankel_1(v, x, Policy()); }\
 \
    template <class T1, class T2>\
-   inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x)\
+   inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x)\
    { return boost::math::sph_hankel_2(v, x, Policy()); }\
 \
    template \
diff --git a/include/boost/math/tools/complex.hpp b/include/boost/math/tools/complex.hpp
index f93d2958d..ec5144011 100644
--- a/include/boost/math/tools/complex.hpp
+++ b/include/boost/math/tools/complex.hpp
@@ -14,11 +14,33 @@
 #include

 #ifdef BOOST_MATH_ENABLE_CUDA
+
 #include
-#endif
+#include
+
+namespace boost {
+namespace math {
+
+template <typename T>
+using complex = cuda::std::complex<T>;
+
+} // namespace math
+} // namespace boost
+
+#else

-#ifndef BOOST_MATH_HAS_NVRTC
 #include
+#include
+
+namespace boost {
+namespace math {
+
+template <typename T>
+using complex = std::complex<T>;
+
+} // namespace math
+} // namespace boost
+
 #endif

 namespace boost {
diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp
index 99bc4ee27..b3a8c2cf8 100644
--- a/include/boost/math/tools/config.hpp
+++ b/include/boost/math/tools/config.hpp
@@ -678,6 +678,7 @@ namespace boost{ namespace math{
 #include
 #include
 #include
+#include

 #  define BOOST_MATH_CUDA_ENABLED __host__ __device__
 #  define BOOST_MATH_HAS_GPU_SUPPORT
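The complex.hpp change is what makes all the signatures above spellable in one form: boost::math::complex<T> is now an alias template that resolves to cuda::std::complex<T> when BOOST_MATH_ENABLE_CUDA is defined and to std::complex<T> otherwise. A compile-time sketch of the guarantee on a plain host build, illustrative and not part of the patch:

    #include <boost/math/tools/complex.hpp>
    #include <type_traits>

    #ifndef BOOST_MATH_ENABLE_CUDA
    // On a host-only build the alias must be exactly std::complex.
    static_assert(std::is_same<boost::math::complex<double>,
                               std::complex<double>>::value,
                  "boost::math::complex should alias std::complex on host builds");
    #endif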
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 34ac3b965..d72501559 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -305,6 +305,14 @@ run test_cyl_neumann_double.cu ;
 run test_cyl_neumann_float.cu ;
 run test_sph_neumann_double.cu ;
 run test_sph_neumann_float.cu ;
+run test_cyl_hankel_1_double.cu ;
+run test_cyl_hankel_1_float.cu ;
+run test_cyl_hankel_2_double.cu ;
+run test_cyl_hankel_2_float.cu ;
+run test_sph_hankel_1_double.cu ;
+run test_sph_hankel_1_float.cu ;
+run test_sph_hankel_2_double.cu ;
+run test_sph_hankel_2_float.cu ;

 run test_cbrt_double.cu ;
 run test_cbrt_float.cu ;
@@ -340,9 +348,18 @@ run test_erfc_float.cu ;
 run test_erfc_inv_double.cu ;
 run test_erfc_inv_float.cu ;

+run test_expint_double.cu ;
+run test_expint_float.cu ;
+
 run test_expm1_double.cu ;
 run test_expm1_float.cu ;

+run test_gegenbauer_double.cu ;
+run test_gegenbauer_float.cu ;
+
+run test_hermite_double.cu ;
+run test_hermite_float.cu ;
+
 run test_lgamma_double.cu ;
 run test_lgamma_float.cu ;
 run test_tgamma_double.cu ;
diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index eb5da37c8..e049a24d2 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -9,9 +9,6 @@ project : requirements
   [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
   ;

-run test_heumann_lambda_nvrtc_double.cpp ;
-run test_heumann_lambda_nvrtc_float.cpp ;
-
 # Quad
 run test_exp_sinh_quad_nvrtc_float.cpp ;
 run test_exp_sinh_quad_nvrtc_double.cpp ;
@@ -306,6 +303,14 @@ run test_cyl_neumann_nvrtc_double.cpp ;
 run test_cyl_neumann_nvrtc_float.cpp ;
 run test_sph_neumann_nvrtc_double.cpp ;
 run test_sph_neumann_nvrtc_float.cpp ;
+run test_cyl_hankel_1_nvrtc_double.cpp ;
+run test_cyl_hankel_1_nvrtc_float.cpp ;
+run test_cyl_hankel_2_nvrtc_double.cpp ;
+run test_cyl_hankel_2_nvrtc_float.cpp ;
+run test_sph_hankel_1_nvrtc_double.cpp ;
+run test_sph_hankel_1_nvrtc_float.cpp ;
+run test_sph_hankel_2_nvrtc_double.cpp ;
+run test_sph_hankel_2_nvrtc_float.cpp ;

 run test_cbrt_nvrtc_double.cpp ;
 run test_cbrt_nvrtc_float.cpp ;
@@ -335,6 +340,11 @@ run test_ellint_d_nvrtc_double.cpp ;
 run test_ellint_d_nvrtc_float.cpp ;
 run test_jacobi_zeta_nvrtc_double.cpp ;
 run test_jacobi_zeta_nvrtc_float.cpp ;
+run test_heumann_lambda_nvrtc_double.cpp ;
+run test_heumann_lambda_nvrtc_float.cpp ;
+
+run test_expint_nvrtc_double.cpp ;
+run test_expint_nvrtc_float.cpp ;

 run test_expm1_nvrtc_double.cpp ;
 run test_expm1_nvrtc_float.cpp ;
@@ -351,6 +361,12 @@ run test_gamma_p_inv_nvrtc_float.cpp ;
 run test_tgamma_ratio_nvrtc_double.cpp ;
 run test_tgamma_ratio_nvrtc_float.cpp ;

+run test_gegenbauer_nvrtc_double.cpp ;
+run test_gegenbauer_nvrtc_float.cpp ;
+
+run test_hermite_nvrtc_double.cpp ;
+run test_hermite_nvrtc_float.cpp ;
+
 run test_log1p_nvrtc_double.cpp ;
 run test_log1p_nvrtc_float.cpp ;
diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index ff6d84fde..582eaea40 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -71,8 +71,14 @@ run test_sign.cpp ;

 run test_round.cpp ;

+run test_expint.cpp ;
+
 run test_expm1_simple.cpp ;

+run gegenbauer_test.cpp ;
+
+run test_hankel.cpp ;
+
 run test_log1p_simple.cpp ;

 run test_digamma_simple.cpp ;
@@ -85,3 +91,5 @@ run test_gamma.cpp ;
 run test_igamma.cpp ;
 run test_igamma_inv.cpp ;
 run test_igamma_inva.cpp ;
+
+run test_hermite.cpp ;
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cyl_hankel_1_float.cu b/test/test_cyl_hankel_1_float.cu new file mode 100644 index 000000000..da78c375c --- /dev/null +++ b/test/test_cyl_hankel_1_float.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
diff --git a/test/test_cyl_hankel_1_float.cu b/test/test_cyl_hankel_1_float.cu
new file mode 100644
index 000000000..da78c375c
--- /dev/null
+++ b/test/test_cyl_hankel_1_float.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
+                      << "\n  Host: " << results[i].real() << ", " << results[i].imag()
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_hankel_1_nvrtc_double.cpp b/test/test_cyl_hankel_1_nvrtc_double.cpp
new file mode 100644
index 000000000..298436d06
--- /dev/null
+++ b/test/test_cyl_hankel_1_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/tools/complex.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/complex>
+#include <boost/math/special_functions/hankel.hpp>
+extern "C" __global__
+void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at element: " << i
+                          << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
+                          << "\n  Serial: " << res.real() << ", " << res.imag()
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
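The NVRTC drivers compile the kernel source at run time, load the resulting PTX through the driver API, and size the grid by ceiling division so every element is covered even when numElements is not a multiple of the block size:

    // e.g. numElements = 5000, blockSize = 256:
    // (5000 + 256 - 1) / 256 = 20 blocks, i.e. 5120 threads; the excess
    // threads are filtered out by the `if (i < numElements)` guard in the kernel.
    int numBlocks = (numElements + blockSize - 1) / blockSize;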
diff --git a/test/test_cyl_hankel_1_nvrtc_float.cpp b/test/test_cyl_hankel_1_nvrtc_float.cpp
new file mode 100644
index 000000000..d505c7bc4
--- /dev/null
+++ b/test/test_cyl_hankel_1_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/tools/complex.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/complex>
+#include <boost/math/special_functions/hankel.hpp>
+extern "C" __global__
+void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at element: " << i
+                          << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
+                          << "\n  Serial: " << res.real() << ", " << res.imag()
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_hankel_2_double.cu b/test/test_cyl_hankel_2_double.cu
new file mode 100644
index 000000000..55b643173
--- /dev/null
+++ b/test/test_cyl_hankel_2_double.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
+                      << "\n  Host: " << results[i].real() << ", " << results[i].imag()
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_hankel_2_float.cu b/test/test_cyl_hankel_2_float.cu
new file mode 100644
index 000000000..5766ebeb4
--- /dev/null
+++ b/test/test_cyl_hankel_2_float.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
+                      << "\n  Host: " << results[i].real() << ", " << results[i].imag()
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_hankel_2_nvrtc_double.cpp b/test/test_cyl_hankel_2_nvrtc_double.cpp
new file mode 100644
index 000000000..f7589d201
--- /dev/null
+++ b/test/test_cyl_hankel_2_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/tools/complex.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/complex>
+#include <boost/math/special_functions/hankel.hpp>
+extern "C" __global__
+void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at element: " << i
+                          << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
+                          << "\n  Serial: " << res.real() << ", " << res.imag()
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_hankel_2_nvrtc_float.cpp b/test/test_cyl_hankel_2_nvrtc_float.cpp
new file mode 100644
index 000000000..54216d39c
--- /dev/null
+++ b/test/test_cyl_hankel_2_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/tools/complex.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/complex>
+#include <boost/math/special_functions/hankel.hpp>
+extern "C" __global__
+void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at element: " << i
+                          << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
+                          << "\n  Serial: " << res.real() << ", " << res.imag()
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_expint.cpp b/test/test_expint.cpp
index 3f44a8091..3eede5e38 100644
--- a/test/test_expint.cpp
+++ b/test/test_expint.cpp
@@ -3,7 +3,14 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+
+#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#endif
+
 #include "test_expint.hpp"
 
 //
@@ -78,7 +85,11 @@ void expected_results()
       ".*",                          // platform
       "float|double|long double",    // test type(s)
       ".*Ei.*",                      // test data group
+      #ifndef SYCL_LANGUAGE_VERSION
       ".*", 6, 3);                   // test function
+      #else
+      ".*", 10, 3);
+      #endif
    if(std::numeric_limits<long double>::digits > 100)
    {
       add_expected_result(
diff --git a/test/test_expint.hpp b/test/test_expint.hpp
index 491db2fcd..d6524810e 100644
--- a/test/test_expint.hpp
+++ b/test/test_expint.hpp
@@ -4,13 +4,19 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#include <boost/math/tools/config.hpp>
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp>
+#endif
+
 #include <boost/math/special_functions/expint.hpp>
+#include <boost/math/tools/type_traits.hpp>
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include <boost/math/tools/stats.hpp>
 #include <boost/math/constants/constants.hpp>
 #include <boost/array.hpp>
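test_expint.cpp and the NVRTC drivers define BOOST_MATH_OVERFLOW_ERROR_POLICY as ignore_error before the first Boost.Math include, so an overflow yields an infinity instead of a thrown exception; the verification loops then skip non-finite reference values via std::isfinite. A minimal host-side sketch of the policy's effect (an aside, not part of the patch):

    // The policy macro must be defined before any Boost.Math header is included.
    #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
    #include <boost/math/special_functions/expint.hpp>
    #include <iostream>

    int main()
    {
        // Ei(x) grows roughly like e^x / x, so x = 1000 overflows a double;
        // under ignore_error this prints inf rather than throwing.
        std::cout << boost::math::expint(1000.0) << '\n';
    }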
diff --git a/test/test_expint_double.cu b/test/test_expint_double.cu
new file mode 100644
index 000000000..d82e90a93
--- /dev/null
+++ b/test/test_expint_double.cu
@@ -0,0 +1,106 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <cmath>
+#include <boost/math/special_functions/expint.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::expint(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    std::mt19937_64 rng(42);
+    std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::expint(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_expint_float.cu b/test/test_expint_float.cu
new file mode 100644
index 000000000..dd1fccd1d
--- /dev/null
+++ b/test/test_expint_float.cu
@@ -0,0 +1,106 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <cmath>
+#include <boost/math/special_functions/expint.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::expint(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    std::mt19937_64 rng(42);
+    std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::expint(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_expint_nvrtc_double.cpp b/test/test_expint_nvrtc_double.cpp
new file mode 100644
index 000000000..3ab45e6a1
--- /dev/null
+++ b/test/test_expint_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/expint.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+#include <cmath>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/expint.hpp>
+extern "C" __global__
+void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::expint(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_expint_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        // (in2 is unused by this kernel, but kept so all the NVRTC tests
+        // share the same launch signature)
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::expint(h_in1[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at element: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_expint_nvrtc_float.cpp b/test/test_expint_nvrtc_float.cpp
new file mode 100644
index 000000000..bff58580e
--- /dev/null
+++ b/test/test_expint_nvrtc_float.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/expint.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+#include <cmath>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/expint.hpp>
+extern "C" __global__
+void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::expint(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_expint_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        // (in2 is unused by this kernel, but kept so all the NVRTC tests
+        // share the same launch signature)
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::expint(h_in1[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at element: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
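The Gegenbauer tests below exercise gegenbauer_prime, the first derivative of the Gegenbauer polynomial, which satisfies d/dx C[n,lambda](x) = 2 lambda C[n-1,lambda+1](x). That identity gives a quick host-side sanity check (a sketch, assuming only the gegenbauer/gegenbauer_prime signatures used by the tests):

    #include <boost/math/special_functions/gegenbauer.hpp>
    #include <iostream>

    int main()
    {
        unsigned n = 2;
        double lambda = 0.5, x = 0.25;
        double lhs = boost::math::gegenbauer_prime(n, lambda, x);
        double rhs = 2 * lambda * boost::math::gegenbauer(n - 1, lambda + 1.0, x);
        std::cout << lhs << " == " << rhs << '\n'; // both print 0.75
    }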
diff --git a/test/test_gegenbauer_double.cu b/test/test_gegenbauer_double.cu
new file mode 100644
index 000000000..21278d7a8
--- /dev/null
+++ b/test/test_gegenbauer_double.cu
@@ -0,0 +1,125 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// gegenbauer_prime exercises all of the internal methods, so it's the easy choice
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <cmath>
+#include <boost/math/special_functions/gegenbauer.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]);
+            // Most elements agree to well under 50 eps, but results very close to zero vary far more
+            if (eps > 1000)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i]
+                          << "\n  Host: " << results[i]
+                          << "\n   Eps: " << eps << std::endl;
+                ++failure_counter;
+                if (failure_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Gegenbauer prime uses all methods internally so it's the easy choice + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]); + if (eps > 1000) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_gegenbauer_nvrtc_double.cpp b/test/test_gegenbauer_nvrtc_double.cpp new file mode 100644 index 000000000..0c8416cb6 --- /dev/null +++ b/test/test_gegenbauer_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::gegenbauer(2, in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gegenbauer_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
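// Both inputs are uniform draws from [0, 1): in1 supplies the lambda argument and in2 the x argument of gegenbauer(2, lambda, x) in the kernel above
+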
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gegenbauer_nvrtc_float.cpp b/test/test_gegenbauer_nvrtc_float.cpp new file mode 100644 index 000000000..c0d348417 --- /dev/null +++ b/test/test_gegenbauer_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::gegenbauer(2, in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gegenbauer_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
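// As in the double variant: in1 supplies lambda and in2 supplies x for gegenbauer(2, lambda, x)
+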
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_hankel.cpp b/test/test_hankel.cpp index f8bd173da..a93e90c4d 100644 --- a/test/test_hankel.cpp +++ b/test/test_hankel.cpp @@ -3,9 +3,13 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#include + #define BOOST_TEST_MAIN #include #include @@ -85,6 +89,7 @@ void test_hankel(T, const char* name) // // Instantiate a few instances to check our error handling code can cope with std::complex: // +#ifndef SYCL_LANGUAGE_VERSION typedef boost::math::policies::policy< boost::math::policies::overflow_error, boost::math::policies::denorm_error, @@ -120,7 +125,7 @@ typedef boost::math::policies::policy< boost::math::policies::indeterminate_result_error > pol3; template std::complex boost::math::cyl_hankel_1(double, double, const pol3&); - +#endif BOOST_AUTO_TEST_CASE( test_main ) { diff --git a/test/test_hermite.cpp b/test/test_hermite.cpp index d1127feec..60dafdb8f 100644 --- a/test/test_hermite.cpp +++ b/test/test_hermite.cpp @@ -5,8 +5,15 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include -#include"test_hermite.hpp" +#endif + +#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#endif + +#include "test_hermite.hpp" // // DESCRIPTION: diff --git a/test/test_hermite.hpp b/test/test_hermite.hpp index 0b00677ee..8f7c55ff1 100644 --- a/test/test_hermite.hpp +++ b/test/test_hermite.hpp @@ -11,11 +11,17 @@ // Constants are too big for float case, but this doesn't matter for test. #endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif + #define BOOST_TEST_MAIN #include #include #include +#include #include #include #include "functor.hpp" diff --git a/test/test_hermite_double.cu b/test/test_hermite_double.cu new file mode 100644 index 000000000..a53766171 --- /dev/null +++ b/test/test_hermite_double.cu @@ -0,0 +1,120 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::hermite(1U, in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::hermite(1U, input_vector2[i])); + double t = w.elapsed(); + // check the results + int fail_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] << '\n' + << " Host: " << results[i] << '\n' + << " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + fail_counter++; + if (fail_counter > 100) + { + break; + } + } + } + } + + if (fail_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_hermite_float.cu b/test/test_hermite_float.cu new file mode 100644 index 000000000..c48560bbe --- /dev/null +++ b/test/test_hermite_float.cu @@ -0,0 +1,120 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::hermite(1U, in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::hermite(1U, input_vector2[i])); + double t = w.elapsed(); + // check the results + int fail_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] << '\n' + << " Host: " << results[i] << '\n' + << " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + fail_counter++; + if (fail_counter > 100) + { + break; + } + } + } + } + + if (fail_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_hermite_nvrtc_double.cpp b/test/test_hermite_nvrtc_double.cpp new file mode 100644 index 000000000..569d975cb --- /dev/null +++ b/test/test_hermite_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::hermite(1U, in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_hermite_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = 
{"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::hermite(1U, h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_hermite_nvrtc_float.cpp b/test/test_hermite_nvrtc_float.cpp new file mode 100644 index 000000000..e2e907c51 --- /dev/null +++ b/test/test_hermite_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::hermite(1U, in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_hermite_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int 
i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::hermite(1U, h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_sph_hankel_1_double.cu b/test/test_sph_hankel_1_double.cu new file mode 100644 index 000000000..ea9ec2306 --- /dev/null +++ b/test/test_sph_hankel_1_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/tools/complex.hpp>
+#include <iostream>
+#include <vector>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the spherical Hankel CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
+                      << "\n  Host: " << results[i].real() << ", " << results[i].imag()
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_hankel_1_float.cu b/test/test_sph_hankel_1_float.cu
new file mode 100644
index 000000000..4b01fe02a
--- /dev/null
+++ b/test/test_sph_hankel_1_float.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/tools/complex.hpp>
+#include <iostream>
+#include <vector>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the spherical Hankel CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
+                      << "\n  Host: " << results[i].real() << ", " << results[i].imag()
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_hankel_1_nvrtc_double.cpp b/test/test_sph_hankel_1_nvrtc_double.cpp
new file mode 100644
index 000000000..3ff1da8b2
--- /dev/null
+++ b/test/test_sph_hankel_1_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::sph_hankel_1(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2; + float_type *d_in1, *d_in2; + boost::math::complex *h_out, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new boost::math::complex[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + int fail_counter = 0; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]); + if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() + << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; + ++fail_counter; + if (fail_counter > 100) + { + break; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + if (fail_counter > 0) + { + return 1; + } + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_sph_hankel_1_nvrtc_float.cpp b/test/test_sph_hankel_1_nvrtc_float.cpp new file mode 100644 index 000000000..0b0796653 --- /dev/null +++ b/test/test_sph_hankel_1_nvrtc_float.cpp @@ -0,0 +1,199 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::sph_hankel_1(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2; + float_type *d_in1, *d_in2; + boost::math::complex *h_out, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new boost::math::complex[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + int fail_counter = 0; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]); + if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() + << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; + ++fail_counter; + if (fail_counter > 100) + { + break; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + if (fail_counter > 0) + { + return 1; + } + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_sph_hankel_2_double.cu b/test/test_sph_hankel_2_double.cu new file mode 100644 index 000000000..6631f73a0 --- /dev/null +++ b/test/test_sph_hankel_2_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::sph_hankel_2(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_sph_hankel_2_float.cu b/test/test_sph_hankel_2_float.cu new file mode 100644 index 000000000..1910aef04 --- /dev/null +++ b/test/test_sph_hankel_2_float.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<std::complex<float_type>> results;
+    results.resize(numElements); // resize (not reserve) so the indexed writes below are in bounds
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
+                      << "\n  Host: " << results[i].real() << ", " << results[i].imag()
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_hankel_2_nvrtc_double.cpp b/test/test_sph_hankel_2_nvrtc_double.cpp
new file mode 100644
index 000000000..fa57fcbb1
--- /dev/null
+++ b/test/test_sph_hankel_2_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstring>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/complex>
+#include <boost/math/special_functions/hankel.hpp>
+extern "C" __global__
+void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
+
+#ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+#else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+#endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify result against the host-side implementation
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
+                          << "\n  Serial: " << res.real() << ", " << res.imag()
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_hankel_2_nvrtc_float.cpp b/test/test_sph_hankel_2_nvrtc_float.cpp
new file mode 100644
index 000000000..be6fd0d09
--- /dev/null
+++ b/test/test_sph_hankel_2_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstring>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/complex>
+#include <boost/math/special_functions/hankel.hpp>
+extern "C" __global__
+void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
+
+#ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+#else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+#endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify result against the host-side implementation
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
+                          << "\n  Serial: " << res.real() << ", " << res.imag()
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}