diff --git a/include/boost/math/policies/error_handling.hpp b/include/boost/math/policies/error_handling.hpp index 07e112721..ce3f1e7cc 100644 --- a/include/boost/math/policies/error_handling.hpp +++ b/include/boost/math/policies/error_handling.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #ifndef BOOST_MATH_HAS_NVRTC @@ -1017,7 +1018,7 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE R checked_narrowing_cast(T val, co } template -BOOST_MATH_GPU_ENABLED inline void check_series_iterations(const char* function, BOOST_MATH_UINTMAX_T max_iter, const Policy& pol) noexcept(boost::math::is_floating_point_v) +BOOST_MATH_GPU_ENABLED inline void check_series_iterations(const char* function, boost::math::uintmax_t max_iter, const Policy& pol) noexcept(boost::math::is_floating_point_v) { if(max_iter >= policies::get_max_series_iterations()) raise_evaluation_error( @@ -1026,7 +1027,7 @@ BOOST_MATH_GPU_ENABLED inline void check_series_iterations(const char* function, } template -BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, BOOST_MATH_UINTMAX_T max_iter, const Policy& pol) noexcept(boost::math::is_floating_point_v) +BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, boost::math::uintmax_t max_iter, const Policy& pol) noexcept(boost::math::is_floating_point_v) { if(max_iter >= policies::get_max_root_iterations()) raise_evaluation_error( diff --git a/include/boost/math/policies/policy.hpp b/include/boost/math/policies/policy.hpp index 432191362..ec7b36f2d 100644 --- a/include/boost/math/policies/policy.hpp +++ b/include/boost/math/policies/policy.hpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace boost{ namespace math{ @@ -313,7 +314,7 @@ class is_default_policy }; }; -template +template struct append_N { using type = typename append_N, T, N-1>::type; @@ -402,7 +403,7 @@ class policy // Typelist of the arguments: // using arg_list = mp::mp_list; - static constexpr BOOST_MATH_SIZE_T 
arg_list_size = mp::mp_size::value; + static constexpr boost::math::size_t arg_list_size = mp::mp_size::value; template struct pick_arg @@ -533,7 +534,7 @@ class normalise { private: using arg_list = mp::mp_list; - static constexpr BOOST_MATH_SIZE_T arg_list_size = mp::mp_size::value; + static constexpr boost::math::size_t arg_list_size = mp::mp_size::value; template struct pick_arg @@ -882,7 +883,7 @@ struct series_factor_calc::value) { - return 1 / static_cast(static_cast(1u) << (Digits::value - 1)); + return 1 / static_cast(static_cast(1u) << (Digits::value - 1)); } }; template @@ -901,7 +902,7 @@ BOOST_MATH_GPU_ENABLED constexpr T get_epsilon_imp(boost::math::true_type const& static_assert(boost::math::numeric_limits::radix == 2, "boost::math::numeric_limits::radix == 2"); typedef typename boost::math::policies::precision::type p_t; - typedef boost::math::integral_constant::digits> is_small_int; + typedef boost::math::integral_constant::digits> is_small_int; typedef boost::math::integral_constant= boost::math::numeric_limits::digits> is_default_value; return series_factor_calc::get(); } diff --git a/include/boost/math/special_functions/beta.hpp b/include/boost/math/special_functions/beta.hpp index 75968f3eb..f0ac6318b 100644 --- a/include/boost/math/special_functions/beta.hpp +++ b/include/boost/math/special_functions/beta.hpp @@ -11,19 +11,31 @@ #pragma once #endif -#include #include #include +#include +#include +#include +#include +#include +#include #include -#include -#include #include #include #include #include +#include +#include +#include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#include +#include #include -#include #include +#endif namespace boost{ namespace math{ @@ -124,6 +136,7 @@ BOOST_MATH_GPU_ENABLED T beta_imp(T a, T b, const Lanczos&, const Policy& pol) // Generic implementation of Beta(a,b) without Lanczos approximation support // (Caution this is slow!!!): // +#ifndef BOOST_MATH_HAS_NVRTC template BOOST_MATH_GPU_ENABLED T 
beta_imp(T a, T b, const lanczos::undefined_lanczos& l, const Policy& pol) { @@ -194,7 +207,7 @@ BOOST_MATH_GPU_ENABLED T beta_imp(T a, T b, const lanczos::undefined_lanczos& l, } } // template T beta_imp(T a, T b, const lanczos::undefined_lanczos& l) - +#endif // // Compute the leading power terms in the incomplete Beta: @@ -246,11 +259,11 @@ BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, // l1 and l2 are the base of the exponents minus one: T l1 = (x * b - y * agh) / agh; T l2 = (y * a - x * bgh) / bgh; - if(((std::min)(fabs(l1), fabs(l2)) < 0.2)) + if((BOOST_MATH_GPU_SAFE_MIN(fabs(l1), fabs(l2)) < 0.2)) { // when the base of the exponent is very near 1 we get really // gross errors unless extra care is taken: - if((l1 * l2 > 0) || ((std::min)(a, b) < 1)) + if((l1 * l2 > 0) || (BOOST_MATH_GPU_SAFE_MIN(a, b) < 1)) { // // This first branch handles the simple cases where either: @@ -286,7 +299,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, BOOST_MATH_INSTRUMENT_VARIABLE(result); } } - else if((std::max)(fabs(l1), fabs(l2)) < 0.5) + else if(BOOST_MATH_GPU_SAFE_MAX(fabs(l1), fabs(l2)) < 0.5) { // // Both exponents are near one and both the exponents are @@ -448,6 +461,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, // // This version is generic, slow, and does not use the Lanczos approximation. 
// +#ifndef BOOST_MATH_HAS_NVRTC template BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, T b, @@ -484,7 +498,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, bool need_logs = false; if (a < b) { - BOOST_MATH_IF_CONSTEXPR(std::numeric_limits::has_infinity) + BOOST_MATH_IF_CONSTEXPR(boost::math::numeric_limits::has_infinity) { power1 = pow((x * y * c * c) / (a * b), a); power2 = pow((y * c) / b, b - a); @@ -507,7 +521,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, } else { - BOOST_MATH_IF_CONSTEXPR(std::numeric_limits::has_infinity) + BOOST_MATH_IF_CONSTEXPR(boost::math::numeric_limits::has_infinity) { power1 = pow((x * y * c * c) / (a * b), b); power2 = pow((x * c) / a, a - b); @@ -526,7 +540,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, need_logs = true; } } - BOOST_MATH_IF_CONSTEXPR(std::numeric_limits::has_infinity) + BOOST_MATH_IF_CONSTEXPR(boost::math::numeric_limits::has_infinity) { if (!(boost::math::isnormal)(power1) || !(boost::math::isnormal)(power2)) { @@ -625,6 +639,8 @@ BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, } return prefix * power1 * (power2 / bet); } + +#endif // // Series approximation to the incomplete beta: // @@ -717,7 +733,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const Lanczos&, bool if(result < tools::min_value()) return s0; // Safeguard: series can't cope with denorms. 
ibeta_series_t s(a, b, x, result); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter, s0); policies::check_series_iterations("boost::math::ibeta<%1%>(%1%, %1%, %1%) in ibeta_series (with lanczos)", max_iter, pol); return result; @@ -725,6 +741,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const Lanczos&, bool // // Incomplete Beta series again, this time without Lanczos support: // +#ifndef BOOST_MATH_HAS_NVRTC template BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const boost::math::lanczos::undefined_lanczos& l, bool normalised, T* p_derivative, T y, const Policy& pol) { @@ -778,7 +795,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const boost::math::la if(result < tools::min_value()) return s0; // Safeguard: series can't cope with denorms. ibeta_series_t s(a, b, x, result); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter, s0); policies::check_series_iterations("boost::math::ibeta<%1%>(%1%, %1%, %1%) in ibeta_series (without lanczos)", max_iter, pol); return result; @@ -790,7 +807,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const boost::math::la template struct ibeta_fraction2_t { - typedef std::pair result_type; + typedef boost::math::pair result_type; BOOST_MATH_GPU_ENABLED ibeta_fraction2_t(T a_, T b_, T x_, T y_) : a(a_), b(b_), x(x_), y(y_), m(0) {} @@ -806,7 +823,7 @@ struct ibeta_fraction2_t ++m; - return std::make_pair(aN, bN); + return boost::math::make_pair(aN, bN); } private: @@ -867,6 +884,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& p return prefix; } +#endif // // This function is only 
needed for the non-regular incomplete beta, // it computes the delta in: @@ -905,11 +923,15 @@ struct Pn_size { // This is likely to be enough for ~35-50 digit accuracy // but it's hard to quantify exactly: + #ifndef BOOST_MATH_HAS_NVRTC static constexpr unsigned value = ::boost::math::max_factorial::value >= 100 ? 50 : ::boost::math::max_factorial::value >= ::boost::math::max_factorial::value ? 30 : ::boost::math::max_factorial::value >= ::boost::math::max_factorial::value ? 15 : 1; static_assert(::boost::math::max_factorial::value >= ::boost::math::max_factorial::value, "Type does not provide for 35-50 digits of accuracy."); + #else + static constexpr unsigned value = 0; // Will never be called + #endif }; template <> struct Pn_size @@ -936,6 +958,7 @@ struct Pn_size #endif }; +#ifndef BOOST_MATH_HAS_NVRTC template BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& pol, bool normalised) { @@ -1037,7 +1060,7 @@ BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T } return sum; } // template T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Lanczos& l, bool normalised) - +#endif // // For integer arguments we can relate the incomplete beta to the // complement of the binomial distribution cdf and use this finite sum. 
@@ -1224,7 +1247,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, b return p; } - if((std::min)(a, b) <= 1) + if(BOOST_MATH_GPU_SAFE_MIN(a, b) <= 1) { if(x > 0.5) { @@ -1233,10 +1256,10 @@ BOOST_MATH_GPU_ENABLED T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, b invert = !invert; BOOST_MATH_INSTRUMENT_VARIABLE(invert); } - if((std::max)(a, b) <= 1) + if(BOOST_MATH_GPU_SAFE_MAX(a, b) <= 1) { // Both a,b < 1: - if((a >= (std::min)(T(0.2), b)) || (pow(x, a) <= 0.9)) + if((a >= BOOST_MATH_GPU_SAFE_MIN(T(0.2), b)) || (pow(x, a) <= 0.9)) { if(!invert) { @@ -1405,7 +1428,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, b if(b < 40) { - if((floor(a) == a) && (floor(b) == b) && (a < static_cast((std::numeric_limits::max)() - 100)) && (y != 1)) + if((floor(a) == a) && (floor(b) == b) && (a < static_cast((boost::math::numeric_limits::max)() - 100)) && (y != 1)) { // relate to the binomial distribution and use a finite sum: T k = a - 1; @@ -1726,7 +1749,12 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type } // namespace math } // namespace boost +// TODO(mborland): Get the ibeta_inv working on NVRTC +#ifndef BOOST_MATH_HAS_NVRTC + #include #include +#endif + #endif // BOOST_MATH_SPECIAL_BETA_HPP diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 0d5e5f0b2..afb8e9728 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -14,9 +14,10 @@ #pragma once #endif -#ifndef __CUDACC_RTC__ - #include + +#ifndef BOOST_MATH_HAS_NVRTC + #include #include #include diff --git a/include/boost/math/special_functions/lanczos.hpp b/include/boost/math/special_functions/lanczos.hpp index b804bf829..0ec24bddb 100644 --- a/include/boost/math/special_functions/lanczos.hpp +++ b/include/boost/math/special_functions/lanczos.hpp @@ -11,12 +11,16 @@ #endif #include -#include #include 
+#include +#include +#include #include -#include -#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include #include +#endif #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -59,7 +63,7 @@ BOOST_MATH_GPU_ENABLED inline double lanczos_g_near_1_and_2(const L&) // Max experimental error (with arbitrary precision arithmetic) 9.516e-12 // Generated with compiler: Microsoft Visual C++ version 8.0 on Win32 at Mar 23 2006 // -struct lanczos6 : public std::integral_constant +struct lanczos6 : public boost::math::integral_constant { // // Produces slightly better than float precision when evaluated at @@ -77,13 +81,13 @@ struct lanczos6 : public std::integral_constant static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 63.99951844938187085666201263218840287667)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 2.506628274631006311133031631822390264407)) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint16_t) denom[6] = { - static_cast(0u), - static_cast(24u), - static_cast(50u), - static_cast(35u), - static_cast(10u), - static_cast(1u) + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint16_t) denom[6] = { + static_cast(0u), + static_cast(24u), + static_cast(50u), + static_cast(35u), + static_cast(10u), + static_cast(1u) }; // LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -101,13 +105,13 @@ struct lanczos6 : public std::integral_constant static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 0.2412010548258800231126240760264822486599)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 0.009446967704539249494420221613134244048319)) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint16_t) denom[6] = { - static_cast(0u), - static_cast(24u), - static_cast(50u), - static_cast(35u), - static_cast(10u), - static_cast(1u) + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint16_t) denom[6] = { + static_cast(0u), + static_cast(24u), + static_cast(50u), + static_cast(35u), + static_cast(10u), + 
static_cast(1u) }; // LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -163,7 +167,7 @@ struct lanczos6 : public std::integral_constant // Max experimental error (with arbitrary precision arithmetic) 2.16676e-19 // Generated with compiler: Microsoft Visual C++ version 8.0 on Win32 at Mar 23 2006 // -struct lanczos11 : public std::integral_constant +struct lanczos11 : public boost::math::integral_constant { // // Produces slightly better than double precision when evaluated at @@ -186,18 +190,18 @@ struct lanczos11 : public std::integral_constant static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 261.6140441641668190791708576058805625502)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 2.506628274631000502415573855452633787834)) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint32_t) denom[11] = { - static_cast(0u), - static_cast(362880u), - static_cast(1026576u), - static_cast(1172700u), - static_cast(723680u), - static_cast(269325u), - static_cast(63273u), - static_cast(9450u), - static_cast(870u), - static_cast(45u), - static_cast(1u) + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint32_t) denom[11] = { + static_cast(0u), + static_cast(362880u), + static_cast(1026576u), + static_cast(1172700u), + static_cast(723680u), + static_cast(269325u), + static_cast(63273u), + static_cast(9450u), + static_cast(870u), + static_cast(45u), + static_cast(1u) }; // LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -220,18 +224,18 @@ struct lanczos11 : public std::integral_constant static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 0.004826466289237661857584712046231435101741)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 0.4624429436045378766270459638520555557321e-4)) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint32_t) denom[11] = { - static_cast(0u), - static_cast(362880u), - static_cast(1026576u), - static_cast(1172700u), - static_cast(723680u), - static_cast(269325u), - 
static_cast(63273u), - static_cast(9450u), - static_cast(870u), - static_cast(45u), - static_cast(1u) + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint32_t) denom[11] = { + static_cast(0u), + static_cast(362880u), + static_cast(1026576u), + static_cast(1172700u), + static_cast(723680u), + static_cast(269325u), + static_cast(63273u), + static_cast(9450u), + static_cast(870u), + static_cast(45u), + static_cast(1u) }; // LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -297,7 +301,7 @@ struct lanczos11 : public std::integral_constant // Max experimental error (with arbitrary precision arithmetic) 9.2213e-23 // Generated with compiler: Microsoft Visual C++ version 8.0 on Win32 at Mar 23 2006 // -struct lanczos13 : public std::integral_constant +struct lanczos13 : public boost::math::integral_constant { // // Produces slightly better than extended-double precision when evaluated at @@ -322,20 +326,20 @@ struct lanczos13 : public std::integral_constant static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 381.8801248632926870394389468349331394196)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 2.506628274631000502415763426076722427007)) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint32_t) denom[13] = { - static_cast(0u), - static_cast(39916800u), - static_cast(120543840u), - static_cast(150917976u), - static_cast(105258076u), - static_cast(45995730u), - static_cast(13339535u), - static_cast(2637558u), - static_cast(357423u), - static_cast(32670u), - static_cast(1925u), - static_cast(66u), - static_cast(1u) + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint32_t) denom[13] = { + static_cast(0u), + static_cast(39916800u), + static_cast(120543840u), + static_cast(150917976u), + static_cast(105258076u), + static_cast(45995730u), + static_cast(13339535u), + static_cast(2637558u), + static_cast(357423u), + static_cast(32670u), + static_cast(1925u), + static_cast(66u), + static_cast(1u) }; // 
LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -360,20 +364,20 @@ struct lanczos13 : public std::integral_constant static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 0.0007469903808915448316510079585999893674101)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 0.4903180573459871862552197089738373164184e-5)) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint32_t) denom[13] = { - static_cast(0u), - static_cast(39916800u), - static_cast(120543840u), - static_cast(150917976u), - static_cast(105258076u), - static_cast(45995730u), - static_cast(13339535u), - static_cast(2637558u), - static_cast(357423u), - static_cast(32670u), - static_cast(1925u), - static_cast(66u), - static_cast(1u) + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint32_t) denom[13] = { + static_cast(0u), + static_cast(39916800u), + static_cast(120543840u), + static_cast(150917976u), + static_cast(105258076u), + static_cast(45995730u), + static_cast(13339535u), + static_cast(2637558u), + static_cast(357423u), + static_cast(32670u), + static_cast(1925u), + static_cast(66u), + static_cast(1u) }; // LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -443,7 +447,7 @@ struct lanczos13 : public std::integral_constant // Max experimental error (with arbitrary precision arithmetic) 8.111667e-8 // Generated with compiler: Microsoft Visual C++ version 8.0 on Win32 at Mar 23 2006 // -struct lanczos6m24 : public std::integral_constant +struct lanczos6m24 : public boost::math::integral_constant { // // Use for float precision, when evaluated as a float: @@ -460,13 +464,13 @@ struct lanczos6m24 : public std::integral_constant static_cast(27.5192015197455403062503721613097825345L), static_cast(2.50662858515256974113978724717473206342L) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint16_t) denom[6] = { - static_cast(0u), - static_cast(24u), - static_cast(50u), - static_cast(35u), - static_cast(10u), - 
static_cast(1u) + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint16_t) denom[6] = { + static_cast(0u), + static_cast(24u), + static_cast(50u), + static_cast(35u), + static_cast(10u), + static_cast(1u) }; // LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -484,13 +488,13 @@ struct lanczos6m24 : public std::integral_constant static_cast(6.595765571169314946316366571954421695196L), static_cast(0.6007854010515290065101128585795542383721L) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint16_t) denom[6] = { - static_cast(0u), - static_cast(24u), - static_cast(50u), - static_cast(35u), - static_cast(10u), - static_cast(1u) + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint16_t) denom[6] = { + static_cast(0u), + static_cast(24u), + static_cast(50u), + static_cast(35u), + static_cast(10u), + static_cast(1u) }; // LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -546,7 +550,7 @@ struct lanczos6m24 : public std::integral_constant // Max experimental error (with arbitrary precision arithmetic) 1.196214e-17 // Generated with compiler: Microsoft Visual C++ version 8.0 on Win32 at Mar 23 2006 // -struct lanczos13m53 : public std::integral_constant +struct lanczos13m53 : public boost::math::integral_constant { // // Use for double precision, when evaluated as a double: @@ -570,20 +574,20 @@ struct lanczos13m53 : public std::integral_constant static_cast(210.8242777515793458725097339207133627117L), static_cast(2.506628274631000270164908177133837338626L) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint32_t) denom[13] = { - static_cast(0u), - static_cast(39916800u), - static_cast(120543840u), - static_cast(150917976u), - static_cast(105258076u), - static_cast(45995730u), - static_cast(13339535u), - static_cast(2637558u), - static_cast(357423u), - static_cast(32670u), - static_cast(1925u), - static_cast(66u), - static_cast(1u) + 
BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint32_t) denom[13] = { + static_cast(0u), + static_cast(39916800u), + static_cast(120543840u), + static_cast(150917976u), + static_cast(105258076u), + static_cast(45995730u), + static_cast(13339535u), + static_cast(2637558u), + static_cast(357423u), + static_cast(32670u), + static_cast(1925u), + static_cast(66u), + static_cast(1u) }; // LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -608,20 +612,20 @@ struct lanczos13m53 : public std::integral_constant static_cast(0.5098416655656676188125178644804694509993L), static_cast(0.006061842346248906525783753964555936883222L) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint32_t) denom[13] = { - static_cast(0u), - static_cast(39916800u), - static_cast(120543840u), - static_cast(150917976u), - static_cast(105258076u), - static_cast(45995730u), - static_cast(13339535u), - static_cast(2637558u), - static_cast(357423u), - static_cast(32670u), - static_cast(1925u), - static_cast(66u), - static_cast(1u) + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint32_t) denom[13] = { + static_cast(0u), + static_cast(39916800u), + static_cast(120543840u), + static_cast(150917976u), + static_cast(105258076u), + static_cast(45995730u), + static_cast(13339535u), + static_cast(2637558u), + static_cast(357423u), + static_cast(32670u), + static_cast(1925u), + static_cast(66u), + static_cast(1u) }; // LCOV_EXCL_STOP return boost::math::tools::evaluate_rational(num, denom, z); @@ -691,7 +695,7 @@ struct lanczos13m53 : public std::integral_constant // Max experimental error (with arbitrary precision arithmetic) 2.7699e-26 // Generated with compiler: Microsoft Visual C++ version 8.0 on Win32 at Mar 23 2006 // -struct lanczos17m64 : public std::integral_constant +struct lanczos17m64 : public boost::math::integral_constant { // // Use for extended-double precision, when evaluated as an extended-double: @@ -719,7 
+723,7 @@ struct lanczos17m64 : public std::integral_constant static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 488.0063567520005730476791712814838113252)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.50662827463100050241576877135758834683)) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint64_t) denom[17] = { + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint64_t) denom[17] = { BOOST_MATH_INT_VALUE_SUFFIX(0, uLL), BOOST_MATH_INT_VALUE_SUFFIX(1307674368000, uLL), BOOST_MATH_INT_VALUE_SUFFIX(4339163001600, uLL), @@ -765,7 +769,7 @@ struct lanczos17m64 : public std::integral_constant static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.002393749522058449186690627996063983095463)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1229541408909435212800785616808830746135e-4)) }; - BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, std::uint64_t) denom[17] = { + BOOST_MATH_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::math::uint64_t) denom[17] = { BOOST_MATH_INT_VALUE_SUFFIX(0, uLL), BOOST_MATH_INT_VALUE_SUFFIX(1307674368000, uLL), BOOST_MATH_INT_VALUE_SUFFIX(4339163001600, uLL), @@ -860,13 +864,13 @@ struct lanczos17m64 : public std::integral_constant // Max experimental error (with arbitrary precision arithmetic) 1.0541e-38 // Generated with compiler: Microsoft Visual C++ version 8.0 on Win32 at Mar 23 2006 // -struct lanczos24m113 : public std::integral_constant +struct lanczos24m113 : public boost::math::integral_constant { // // Use for long-double precision, when evaluated as an long-double: // template - static T lanczos_sum(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[24] = { @@ -926,7 +930,7 @@ struct lanczos24m113 : public std::integral_constant } template - static T lanczos_sum_expG_scaled(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_expG_scaled(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[24] = { @@ -987,7 +991,7 @@ struct 
lanczos24m113 : public std::integral_constant template - static T lanczos_sum_near_1(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_1(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[23] = { @@ -1025,7 +1029,7 @@ struct lanczos24m113 : public std::integral_constant } template - static T lanczos_sum_near_2(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_2(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[23] = { @@ -1063,7 +1067,7 @@ struct lanczos24m113 : public std::integral_constant return result; } - static double g(){ return 20.3209821879863739013671875; } + BOOST_MATH_GPU_ENABLED static double g(){ return 20.3209821879863739013671875; } }; // @@ -1072,10 +1076,10 @@ struct lanczos24m113 : public std::integral_constant // Generated with compiler: Microsoft Visual C++ version 14.2 on Win32 at May 23 2021 // Type precision was 134 bits or 42 max_digits10 // -struct lanczos27MP : public std::integral_constant +struct lanczos27MP : public boost::math::integral_constant { template - static T lanczos_sum(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[27] = { @@ -1141,7 +1145,7 @@ struct lanczos27MP : public std::integral_constant } template - static T lanczos_sum_expG_scaled(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_expG_scaled(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[27] = { @@ -1208,7 +1212,7 @@ struct lanczos27MP : public std::integral_constant template - static T lanczos_sum_near_1(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_1(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[34] = { @@ -1257,7 +1261,7 @@ struct lanczos27MP : public std::integral_constant } template - static T lanczos_sum_near_2(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_2(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[34] = { @@ -1306,10 +1310,10 @@ struct 
lanczos27MP : public std::integral_constant return result; } - static double g() { return 2.472513680905104038743047567550092935562134e+01; } + BOOST_MATH_GPU_ENABLED static double g() { return 2.472513680905104038743047567550092935562134e+01; } }; -inline double lanczos_g_near_1_and_2(const lanczos27MP&) +BOOST_MATH_GPU_ENABLED inline double lanczos_g_near_1_and_2(const lanczos27MP&) { return 17.03623256087303; } @@ -1320,10 +1324,10 @@ inline double lanczos_g_near_1_and_2(const lanczos27MP&) // Generated with compiler: Microsoft Visual C++ version 14.2 on Win32 at Oct 14 2019 // Type precision was 168 bits or 53 max_digits10 // -struct lanczos35MP : public std::integral_constant +struct lanczos35MP : public boost::math::integral_constant { template - static T lanczos_sum(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[35] = { @@ -1405,7 +1409,7 @@ struct lanczos35MP : public std::integral_constant } template - static T lanczos_sum_expG_scaled(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_expG_scaled(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[35] = { @@ -1488,7 +1492,7 @@ struct lanczos35MP : public std::integral_constant template - static T lanczos_sum_near_1(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_1(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[42] = { @@ -1545,7 +1549,7 @@ struct lanczos35MP : public std::integral_constant } template - static T lanczos_sum_near_2(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_2(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[42] = { @@ -1602,10 +1606,10 @@ struct lanczos35MP : public std::integral_constant return result; } - static double g() { return 2.96640371531248092651367187500000000000000000000000000e+01; } + BOOST_MATH_GPU_ENABLED static double g() { return 2.96640371531248092651367187500000000000000000000000000e+01; } }; -inline double 
lanczos_g_near_1_and_2(const lanczos35MP&) +BOOST_MATH_GPU_ENABLED inline double lanczos_g_near_1_and_2(const lanczos35MP&) { return 22.36563469469547; } @@ -1615,10 +1619,10 @@ inline double lanczos_g_near_1_and_2(const lanczos35MP&) // Generated with compiler: Microsoft Visual C++ version 14.2 on Win32 at Oct 14 2019 // Type precision was 201 bits or 63 max_digits10 // -struct lanczos48MP : public std::integral_constant +struct lanczos48MP : public boost::math::integral_constant { template - static T lanczos_sum(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[48] = { @@ -1726,7 +1730,7 @@ struct lanczos48MP : public std::integral_constant } template - static T lanczos_sum_expG_scaled(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_expG_scaled(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[48] = { @@ -1835,7 +1839,7 @@ struct lanczos48MP : public std::integral_constant template - static T lanczos_sum_near_1(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_1(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[47] = { @@ -1897,7 +1901,7 @@ struct lanczos48MP : public std::integral_constant } template - static T lanczos_sum_near_2(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_2(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[47] = { @@ -1959,7 +1963,7 @@ struct lanczos48MP : public std::integral_constant return result; } - static double g() { return 2.880805098265409469604492187500000000000000000000000000000000000e+01; } + BOOST_MATH_GPU_ENABLED static double g() { return 2.880805098265409469604492187500000000000000000000000000000000000e+01; } }; // // Lanczos Coefficients for N=49 G=3.531905273437499914734871708787977695465087890625000000000000000000000000e+01 @@ -1967,10 +1971,10 @@ struct lanczos48MP : public std::integral_constant // Generated with compiler: Microsoft Visual C++ version 14.2 on Win32 at May 23 
2021 // Type precision was 234 bits or 72 max_digits10 // -struct lanczos49MP : public std::integral_constant +struct lanczos49MP : public boost::math::integral_constant { template - static T lanczos_sum(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[49] = { @@ -2080,7 +2084,7 @@ struct lanczos49MP : public std::integral_constant } template - static T lanczos_sum_expG_scaled(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_expG_scaled(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[49] = { @@ -2191,7 +2195,7 @@ struct lanczos49MP : public std::integral_constant template - static T lanczos_sum_near_1(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_1(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[48] = { @@ -2254,7 +2258,7 @@ struct lanczos49MP : public std::integral_constant } template - static T lanczos_sum_near_2(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_2(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[48] = { @@ -2317,10 +2321,10 @@ struct lanczos49MP : public std::integral_constant return result; } - static double g() { return 3.531905273437499914734871708787977695465087890625000000000000000000000000e+01; } + BOOST_MATH_GPU_ENABLED static double g() { return 3.531905273437499914734871708787977695465087890625000000000000000000000000e+01; } }; -inline double lanczos_g_near_1_and_2(const lanczos49MP&) +BOOST_MATH_GPU_ENABLED inline double lanczos_g_near_1_and_2(const lanczos49MP&) { return 33.54638671875000; } @@ -2331,10 +2335,10 @@ inline double lanczos_g_near_1_and_2(const lanczos49MP&) // Generated with compiler: Microsoft Visual C++ version 14.2 on Win32 at May 22 2021 // Type precision was 267 bits or 82 max_digits10 // -struct lanczos52MP : public std::integral_constant +struct lanczos52MP : public boost::math::integral_constant { template - static T lanczos_sum(const T& z) + 
BOOST_MATH_GPU_ENABLED static T lanczos_sum(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[52] = { @@ -2450,7 +2454,7 @@ struct lanczos52MP : public std::integral_constant } template - static T lanczos_sum_expG_scaled(const T& z) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_expG_scaled(const T& z) { // LCOV_EXCL_START BOOST_MATH_STATIC const T num[52] = { @@ -2567,7 +2571,7 @@ struct lanczos52MP : public std::integral_constant template - static T lanczos_sum_near_1(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_1(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[56] = { @@ -2638,7 +2642,7 @@ struct lanczos52MP : public std::integral_constant } template - static T lanczos_sum_near_2(const T& dz) + BOOST_MATH_GPU_ENABLED static T lanczos_sum_near_2(const T& dz) { // LCOV_EXCL_START BOOST_MATH_STATIC const T d[56] = { @@ -2709,10 +2713,10 @@ struct lanczos52MP : public std::integral_constant return result; } - static double g() { return 4.9921416015624998863131622783839702606201171875000000000000000000000000000000000000e+01; } + BOOST_MATH_GPU_ENABLED static double g() { return 4.9921416015624998863131622783839702606201171875000000000000000000000000000000000000e+01; } }; -inline double lanczos_g_near_1_and_2(const lanczos52MP&) +BOOST_MATH_GPU_ENABLED inline double lanczos_g_near_1_and_2(const lanczos52MP&) { return 38.73733398437500; } @@ -2721,24 +2725,24 @@ inline double lanczos_g_near_1_and_2(const lanczos52MP&) // // placeholder for no lanczos info available: // -struct undefined_lanczos : public std::integral_constant::max)() - 1> { }; +struct undefined_lanczos : public boost::math::integral_constant::max)() - 1> { }; template struct lanczos { - BOOST_MATH_STATIC constexpr auto target_precision = policies::precision::type::value <= 0 ? (std::numeric_limits::max)()-2 : + BOOST_MATH_STATIC constexpr auto target_precision = policies::precision::type::value <= 0 ? 
(boost::math::numeric_limits::max)()-2 : policies::precision::type::value; - using type = typename std::conditional<(target_precision <= lanczos6m24::value), lanczos6m24, - typename std::conditional<(target_precision <= lanczos13m53::value), lanczos13m53, - typename std::conditional<(target_precision <= lanczos11::value), lanczos11, - typename std::conditional<(target_precision <= lanczos17m64::value), lanczos17m64, - typename std::conditional<(target_precision <= lanczos24m113::value), lanczos24m113, - typename std::conditional<(target_precision <= lanczos27MP::value), lanczos27MP, - typename std::conditional<(target_precision <= lanczos35MP::value), lanczos35MP, - typename std::conditional<(target_precision <= lanczos48MP::value), lanczos48MP, - typename std::conditional<(target_precision <= lanczos49MP::value), lanczos49MP, - typename std::conditional<(target_precision <= lanczos52MP::value), lanczos52MP, undefined_lanczos>::type + using type = typename boost::math::conditional<(target_precision <= lanczos6m24::value), lanczos6m24, + typename boost::math::conditional<(target_precision <= lanczos13m53::value), lanczos13m53, + typename boost::math::conditional<(target_precision <= lanczos11::value), lanczos11, + typename boost::math::conditional<(target_precision <= lanczos17m64::value), lanczos17m64, + typename boost::math::conditional<(target_precision <= lanczos24m113::value), lanczos24m113, + typename boost::math::conditional<(target_precision <= lanczos27MP::value), lanczos27MP, + typename boost::math::conditional<(target_precision <= lanczos35MP::value), lanczos35MP, + typename boost::math::conditional<(target_precision <= lanczos48MP::value), lanczos48MP, + typename boost::math::conditional<(target_precision <= lanczos49MP::value), lanczos49MP, + typename boost::math::conditional<(target_precision <= lanczos52MP::value), lanczos52MP, undefined_lanczos>::type >::type>::type>::type>::type>::type>::type>::type>::type >::type; }; diff --git 
a/include/boost/math/special_functions/modf.hpp b/include/boost/math/special_functions/modf.hpp index e08945dca..6e372ec9a 100644 --- a/include/boost/math/special_functions/modf.hpp +++ b/include/boost/math/special_functions/modf.hpp @@ -11,9 +11,13 @@ #pragma once #endif -#include #include #include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#endif namespace boost{ namespace math{ diff --git a/include/boost/math/special_functions/pow.hpp b/include/boost/math/special_functions/pow.hpp index a056f6daf..7a1bb14eb 100644 --- a/include/boost/math/special_functions/pow.hpp +++ b/include/boost/math/special_functions/pow.hpp @@ -14,11 +14,13 @@ #define BOOST_MATH_POW_HPP #include -#include #include #include #include +#ifndef BOOST_MATH_HAS_NVRTC +#include +#endif namespace boost { namespace math { @@ -35,7 +37,7 @@ template struct positive_power { template - BOOST_MATH_GPU_ENABLED static BOOST_MATH_CXX14_CONSTEXPR T result(T base) + BOOST_MATH_GPU_ENABLED static constexpr T result(T base) { T power = positive_power::result(base); return power * power; @@ -46,7 +48,7 @@ template struct positive_power { template - BOOST_MATH_GPU_ENABLED static BOOST_MATH_CXX14_CONSTEXPR T result(T base) + BOOST_MATH_GPU_ENABLED static constexpr T result(T base) { T power = positive_power::result(base); return base * power * power; @@ -57,7 +59,7 @@ template <> struct positive_power<1, 1> { template - BOOST_MATH_GPU_ENABLED static BOOST_MATH_CXX14_CONSTEXPR T result(T base){ return base; } + BOOST_MATH_GPU_ENABLED static constexpr T result(T base){ return base; } }; @@ -65,7 +67,7 @@ template struct power_if_positive { template - BOOST_MATH_GPU_ENABLED static BOOST_MATH_CXX14_CONSTEXPR T result(T base, const Policy&) + BOOST_MATH_GPU_ENABLED static constexpr T result(T base, const Policy&) { return positive_power::result(base); } }; @@ -73,7 +75,7 @@ template struct power_if_positive { template - BOOST_MATH_GPU_ENABLED static BOOST_MATH_CXX14_CONSTEXPR T result(T base, const Policy& 
policy) + BOOST_MATH_GPU_ENABLED static constexpr T result(T base, const Policy& policy) { if (base == 0) { @@ -92,7 +94,7 @@ template <> struct power_if_positive<0, true> { template - BOOST_MATH_GPU_ENABLED static BOOST_MATH_CXX14_CONSTEXPR T result(T base, const Policy& policy) + BOOST_MATH_GPU_ENABLED static constexpr T result(T base, const Policy& policy) { if (base == 0) { @@ -121,14 +123,14 @@ struct select_power_if_positive template -BOOST_MATH_GPU_ENABLED BOOST_MATH_CXX14_CONSTEXPR inline typename tools::promote_args::type pow(T base, const Policy& policy) +BOOST_MATH_GPU_ENABLED constexpr inline typename tools::promote_args::type pow(T base, const Policy& policy) { using result_type = typename tools::promote_args::type; return detail::select_power_if_positive::type::result(static_cast(base), policy); } template -BOOST_MATH_GPU_ENABLED BOOST_MATH_CXX14_CONSTEXPR inline typename tools::promote_args::type pow(T base) +BOOST_MATH_GPU_ENABLED constexpr inline typename tools::promote_args::type pow(T base) { return pow(base, policies::policy<>()); } #ifdef _MSC_VER diff --git a/include/boost/math/special_functions/trigamma.hpp b/include/boost/math/special_functions/trigamma.hpp index 2ec6c5256..61a60b502 100644 --- a/include/boost/math/special_functions/trigamma.hpp +++ b/include/boost/math/special_functions/trigamma.hpp @@ -11,16 +11,22 @@ #pragma once #endif -#include #include -#include #include -#include #include +#include +#include +#include #include #include -#include +#include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include #include +#include +#endif #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -36,17 +42,20 @@ namespace boost{ namespace math{ namespace detail{ +// TODO(mborland): Temporary for NVRTC +#ifndef BOOST_MATH_HAS_NVRTC template T polygamma_imp(const int n, T x, const Policy &pol); template -T trigamma_prec(T x, const Policy& pol, const std::integral_constant&) +T trigamma_prec(T x, const Policy& pol, const 
boost::math::integral_constant&) { return polygamma_imp(1, x, pol); } +#endif template -BOOST_MATH_GPU_ENABLED T trigamma_prec(T x, const Policy&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T trigamma_prec(T x, const Policy&, const boost::math::integral_constant&) { // Max error in interpolated form: 3.736e-017 BOOST_MATH_STATIC const T offset = BOOST_MATH_BIG_CONSTANT(T, 53, 2.1093254089355469); @@ -119,7 +128,7 @@ BOOST_MATH_GPU_ENABLED T trigamma_prec(T x, const Policy&, const std::integral_c } template -BOOST_MATH_GPU_ENABLED T trigamma_prec(T x, const Policy&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T trigamma_prec(T x, const Policy&, const boost::math::integral_constant&) { // Max error in interpolated form: 1.178e-020 BOOST_MATH_STATIC const T offset_1_2 = BOOST_MATH_BIG_CONSTANT(T, 64, 2.109325408935546875); @@ -197,7 +206,7 @@ BOOST_MATH_GPU_ENABLED T trigamma_prec(T x, const Policy&, const std::integral_c } template -T trigamma_prec(T x, const Policy&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T trigamma_prec(T x, const Policy&, const boost::math::integral_constant&) { // Max error in interpolated form: 1.916e-035 @@ -416,13 +425,13 @@ struct trigamma_initializer BOOST_MATH_GPU_ENABLED init() { typedef typename policies::precision::type precision_type; - do_init(std::integral_constant()); + do_init(boost::math::integral_constant()); } - BOOST_MATH_GPU_ENABLED void do_init(const std::true_type&) + BOOST_MATH_GPU_ENABLED void do_init(const boost::math::true_type&) { boost::math::trigamma(T(2.5), Policy()); } - BOOST_MATH_GPU_ENABLED void do_init(const std::false_type&){} + BOOST_MATH_GPU_ENABLED void do_init(const boost::math::false_type&){} BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; static const init initializer; @@ -446,7 +455,7 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type 
value_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant +#include #include +#ifdef BOOST_MATH_ENABLE_CUDA +#include +#endif + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#endif + namespace boost { namespace math { namespace tools { @@ -24,12 +32,21 @@ namespace boost { static constexpr bool value = false; }; + #ifndef BOOST_MATH_ENABLE_CUDA template struct is_complex_type_impl().real()), decltype(std::declval().imag())>> { static constexpr bool value = true; }; + #else + template + struct is_complex_type_impl().real()), + decltype(cuda::std::declval().imag())>> + { + static constexpr bool value = true; + }; + #endif } // Namespace detail template diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 364aa8f54..2736d660f 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -675,6 +675,7 @@ namespace boost{ namespace math{ #include #include #include +#include # define BOOST_MATH_CUDA_ENABLED __host__ __device__ # define BOOST_MATH_HAS_GPU_SUPPORT @@ -775,11 +776,6 @@ BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { retur #define BOOST_MATH_FP_SUBNORMAL FP_SUBNORMAL #define BOOST_MATH_FP_NORMAL FP_NORMAL -// Missing type from NVRTC -#include -#define BOOST_MATH_SIZE_T std::size_t -#define BOOST_MATH_UINTMAX_T std::uintmax_t - #else // Special section for CUDA NVRTC to ensure we consume no STL headers #ifndef BOOST_MATH_STANDALONE @@ -824,8 +820,8 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b; #define BOOST_MATH_FP_SUBNORMAL 3 #define BOOST_MATH_FP_NORMAL 4 -#define BOOST_MATH_SIZE_T unsigned long -#define BOOST_MATH_UINTMAX_T unsigned long +#define BOOST_MATH_INT_VALUE_SUFFIX(RV, SUF) RV##SUF +#define BOOST_MATH_INT_TABLE_TYPE(RT, IT) IT #if defined(__cpp_inline_variables) && __cpp_inline_variables >= 201606L # define BOOST_MATH_INLINE_CONSTEXPR inline constexpr diff --git 
a/include/boost/math/tools/cstdint.hpp b/include/boost/math/tools/cstdint.hpp new file mode 100644 index 000000000..ce2c913b5 --- /dev/null +++ b/include/boost/math/tools/cstdint.hpp @@ -0,0 +1,107 @@ +// Copyright (c) 2024 Matt Borland +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_TOOLS_CSTDINT +#define BOOST_MATH_TOOLS_CSTDINT + +#include <boost/math/tools/config.hpp> + + +#ifdef BOOST_MATH_ENABLE_CUDA + +#include <cuda/std/cstdint> + +namespace boost { +namespace math { + +using cuda::std::int8_t; +using cuda::std::int16_t; +using cuda::std::int32_t; +using cuda::std::int64_t; + +using cuda::std::int_fast8_t; +using cuda::std::int_fast16_t; +using cuda::std::int_fast32_t; +using cuda::std::int_fast64_t; + +using cuda::std::int_least8_t; +using cuda::std::int_least16_t; +using cuda::std::int_least32_t; +using cuda::std::int_least64_t; + +using cuda::std::intmax_t; +using cuda::std::intptr_t; + +using cuda::std::uint8_t; +using cuda::std::uint16_t; +using cuda::std::uint32_t; +using cuda::std::uint64_t; + +using cuda::std::uint_fast8_t; +using cuda::std::uint_fast16_t; +using cuda::std::uint_fast32_t; +using cuda::std::uint_fast64_t; + +using cuda::std::uint_least8_t; +using cuda::std::uint_least16_t; +using cuda::std::uint_least32_t; +using cuda::std::uint_least64_t; + +using cuda::std::uintmax_t; +using cuda::std::uintptr_t; + +using size_t = unsigned long; + +#else + +#include <cstdint> + +namespace boost { +namespace math { + +using std::int8_t; +using std::int16_t; +using std::int32_t; +using std::int64_t; + +using std::int_fast8_t; +using std::int_fast16_t; +using std::int_fast32_t; +using std::int_fast64_t; + +using std::int_least8_t; +using std::int_least16_t; +using std::int_least32_t; +using std::int_least64_t; + +using std::intmax_t; +using std::intptr_t; + +using std::uint8_t; +using std::uint16_t; +using std::uint32_t; +using std::uint64_t; + 
+using std::uint_fast8_t; +using std::uint_fast16_t; +using std::uint_fast32_t; +using std::uint_fast64_t; + +using std::uint_least8_t; +using std::uint_least16_t; +using std::uint_least32_t; +using std::uint_least64_t; + +using std::uintmax_t; +using std::uintptr_t; + +using std::size_t; + +#endif + +} // namespace math +} // namespace boost + +#endif // BOOST_MATH_TOOLS_CSTDINT diff --git a/include/boost/math/tools/fraction.hpp b/include/boost/math/tools/fraction.hpp index 1970e5ca3..e5a31edf6 100644 --- a/include/boost/math/tools/fraction.hpp +++ b/include/boost/math/tools/fraction.hpp @@ -12,9 +12,12 @@ #endif #include +#include +#include #include #include #include +#include #include #include #include @@ -25,10 +28,10 @@ namespace detail { template - struct is_pair : public std::false_type{}; + struct is_pair : public boost::math::false_type{}; template - struct is_pair> : public std::true_type{}; + struct is_pair> : public boost::math::true_type{}; template struct fraction_traits_simple @@ -110,7 +113,7 @@ namespace detail { // template -BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b_impl(Gen& g, const U& factor, std::uintmax_t& max_terms) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b_impl(Gen& g, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) #ifndef BOOST_MATH_ENABLE_SYCL // SYCL can not handle this condition so we only check float on that platform @@ -140,7 +143,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type C = f; D = 0; - std::uintmax_t counter(max_terms); + boost::math::uintmax_t counter(max_terms); do{ v = g(); D = traits::b(v) + traits::a(v) * D; @@ -162,7 +165,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type } // namespace detail template -BOOST_MATH_GPU_ENABLED inline typename 
detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor, std::uintmax_t& max_terms) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) #ifndef BOOST_MATH_ENABLE_SYCL && noexcept(std::declval()()) @@ -180,7 +183,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type #endif ) { - std::uintmax_t max_terms = (std::numeric_limits::max)(); + boost::math::uintmax_t max_terms = (std::numeric_limits::max)(); return detail::continued_fraction_b_impl(g, factor, max_terms); } @@ -198,12 +201,12 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type using result_type = typename traits::result_type; result_type factor = ldexp(1.0f, 1 - bits); // 1 / pow(result_type(2), bits); - std::uintmax_t max_terms = (std::numeric_limits::max)(); + boost::math::uintmax_t max_terms = (std::numeric_limits::max)(); return detail::continued_fraction_b_impl(g, factor, max_terms); } template -BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits, std::uintmax_t& max_terms) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) #ifndef BOOST_MATH_ENABLE_SYCL && noexcept(std::declval()()) @@ -236,7 +239,7 @@ namespace detail { // Note that the first a1 and b1 returned by generator Gen are both used. 
// template -BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a_impl(Gen& g, const U& factor, std::uintmax_t& max_terms) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a_impl(Gen& g, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) #ifndef BOOST_MATH_ENABLE_SYCL && noexcept(std::declval()()) @@ -266,7 +269,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type C = f; D = 0; - std::uintmax_t counter(max_terms); + boost::math::uintmax_t counter(max_terms); do{ v = g(); @@ -289,7 +292,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type } // namespace detail template -BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor, std::uintmax_t& max_terms) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) #ifndef BOOST_MATH_ENABLE_SYCL && noexcept(std::declval()()) @@ -307,7 +310,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type #endif ) { - std::uintmax_t max_iter = (std::numeric_limits::max)(); + boost::math::uintmax_t max_iter = (std::numeric_limits::max)(); return detail::continued_fraction_a_impl(g, factor, max_iter); } @@ -325,13 +328,13 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type typedef typename traits::result_type result_type; result_type factor = ldexp(1.0f, 1-bits); // 1 / pow(result_type(2), bits); - std::uintmax_t max_iter = (std::numeric_limits::max)(); + boost::math::uintmax_t max_iter = (std::numeric_limits::max)(); return detail::continued_fraction_a_impl(g, factor, max_iter); } template -BOOST_MATH_GPU_ENABLED inline 
typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits, std::uintmax_t& max_terms) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) #ifndef BOOST_MATH_ENABLE_SYCL && noexcept(std::declval()()) diff --git a/include/boost/math/tools/is_detected.hpp b/include/boost/math/tools/is_detected.hpp index 8dfe86b74..93fa96f60 100644 --- a/include/boost/math/tools/is_detected.hpp +++ b/include/boost/math/tools/is_detected.hpp @@ -8,7 +8,7 @@ #ifndef BOOST_MATH_TOOLS_IS_DETECTED_HPP #define BOOST_MATH_TOOLS_IS_DETECTED_HPP -#include +#include namespace boost { namespace math { namespace tools { @@ -20,14 +20,14 @@ namespace detail { template class Op, typename... Args> struct detector { - using value_t = std::false_type; + using value_t = boost::math::false_type; using type = Default; }; template class Op, typename... Args> struct detector>, Op, Args...> { - using value_t = std::true_type; + using value_t = boost::math::true_type; using type = Op; }; diff --git a/include/boost/math/tools/mp.hpp b/include/boost/math/tools/mp.hpp index 66fcbdbaf..560ae8b50 100644 --- a/include/boost/math/tools/mp.hpp +++ b/include/boost/math/tools/mp.hpp @@ -13,6 +13,7 @@ #include #include +#include namespace boost { namespace math { namespace tools { namespace meta_programming { @@ -22,8 +23,8 @@ template struct mp_list {}; // Size_t -template -using mp_size_t = boost::math::integral_constant; +template +using mp_size_t = boost::math::integral_constant; // Boolean template @@ -52,7 +53,7 @@ struct mp_size_impl {}; template class L, typename... 
T> // Template template parameter must use class struct mp_size_impl> { - using type = boost::math::integral_constant; + using type = boost::math::integral_constant; }; } @@ -78,7 +79,7 @@ namespace detail { // At // TODO - Use tree based lookup for larger typelists // http://odinthenerd.blogspot.com/2017/04/tree-based-lookup-why-kvasirmpl-is.html -template +template struct mp_at_c {}; template class L, typename T0, typename... T> @@ -167,7 +168,7 @@ struct mp_at_c, 1 }; } -template +template using mp_at_c = typename detail::mp_at_c::type; template @@ -338,8 +339,8 @@ using mp_remove_if_q = mp_remove_if; template struct integer_sequence {}; -template -using index_sequence = integer_sequence; +template +using index_sequence = integer_sequence; namespace detail { @@ -411,11 +412,11 @@ struct make_integer_sequence_impl template using make_integer_sequence = typename detail::make_integer_sequence_impl::type; -template -using make_index_sequence = make_integer_sequence; +template +using make_index_sequence = make_integer_sequence; template -using index_sequence_for = make_integer_sequence; +using index_sequence_for = make_integer_sequence; }}}} // namespaces diff --git a/include/boost/math/tools/rational.hpp b/include/boost/math/tools/rational.hpp index e3c3e7e33..a535abcdc 100644 --- a/include/boost/math/tools/rational.hpp +++ b/include/boost/math/tools/rational.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #ifndef BOOST_MATH_HAS_NVRTC #include @@ -172,7 +173,7 @@ namespace boost{ namespace math{ namespace tools{ // Forward declaration to keep two phase lookup happy: // template -BOOST_MATH_GPU_ENABLED U evaluate_polynomial(const T* poly, U const& z, BOOST_MATH_SIZE_T count) BOOST_MATH_NOEXCEPT(U); +BOOST_MATH_GPU_ENABLED U evaluate_polynomial(const T* poly, U const& z, boost::math::size_t count) BOOST_MATH_NOEXCEPT(U); namespace detail{ @@ -190,7 +191,7 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_polynomial_c_imp // the loop 
expanded versions above: // template -BOOST_MATH_GPU_ENABLED inline U evaluate_polynomial(const T* poly, U const& z, BOOST_MATH_SIZE_T count) BOOST_MATH_NOEXCEPT(U) +BOOST_MATH_GPU_ENABLED inline U evaluate_polynomial(const T* poly, U const& z, boost::math::size_t count) BOOST_MATH_NOEXCEPT(U) { BOOST_MATH_ASSERT(count > 0); U sum = static_cast(poly[count - 1]); @@ -205,7 +206,7 @@ BOOST_MATH_GPU_ENABLED inline U evaluate_polynomial(const T* poly, U const& z, B // Compile time sized polynomials, just inline forwarders to the // implementations above: // -template +template BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_polynomial(const T(&a)[N], const V& val) BOOST_MATH_NOEXCEPT(V) { typedef boost::math::integral_constant tag_type; @@ -213,7 +214,7 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_polynomial(const } #ifndef BOOST_MATH_HAS_NVRTC -template +template BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_polynomial(const std::array& a, const V& val) BOOST_MATH_NOEXCEPT(V) { typedef boost::math::integral_constant tag_type; @@ -224,19 +225,19 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_polynomial(const // Even polynomials are trivial: just square the argument! 
// template -BOOST_MATH_GPU_ENABLED inline U evaluate_even_polynomial(const T* poly, U z, BOOST_MATH_SIZE_T count) BOOST_MATH_NOEXCEPT(U) +BOOST_MATH_GPU_ENABLED inline U evaluate_even_polynomial(const T* poly, U z, boost::math::size_t count) BOOST_MATH_NOEXCEPT(U) { return evaluate_polynomial(poly, U(z*z), count); } -template +template BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_even_polynomial(const T(&a)[N], const V& z) BOOST_MATH_NOEXCEPT(V) { return evaluate_polynomial(a, V(z*z)); } #ifndef BOOST_MATH_HAS_NVRTC -template +template BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_even_polynomial(const std::array& a, const V& z) BOOST_MATH_NOEXCEPT(V) { return evaluate_polynomial(a, V(z*z)); @@ -246,12 +247,12 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_even_polynomial( // Odd polynomials come next: // template -BOOST_MATH_GPU_ENABLED inline U evaluate_odd_polynomial(const T* poly, U z, BOOST_MATH_SIZE_T count) BOOST_MATH_NOEXCEPT(U) +BOOST_MATH_GPU_ENABLED inline U evaluate_odd_polynomial(const T* poly, U z, boost::math::size_t count) BOOST_MATH_NOEXCEPT(U) { return poly[0] + z * evaluate_polynomial(poly+1, U(z*z), count-1); } -template +template BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_odd_polynomial(const T(&a)[N], const V& z) BOOST_MATH_NOEXCEPT(V) { typedef boost::math::integral_constant tag_type; @@ -259,7 +260,7 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_odd_polynomial(c } #ifndef BOOST_MATH_HAS_NVRTC -template +template BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_odd_polynomial(const std::array& a, const V& z) BOOST_MATH_NOEXCEPT(V) { typedef boost::math::integral_constant tag_type; @@ -268,7 +269,7 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_odd_polynomial(c #endif template -BOOST_MATH_GPU_ENABLED V evaluate_rational(const T* num, const U* denom, const V& z_, BOOST_MATH_SIZE_T count) BOOST_MATH_NOEXCEPT(V); 
+BOOST_MATH_GPU_ENABLED V evaluate_rational(const T* num, const U* denom, const V& z_, boost::math::size_t count) BOOST_MATH_NOEXCEPT(V); namespace detail{ @@ -288,7 +289,7 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_rational_c_imp(c // in our Lanczos code for example. // template -BOOST_MATH_GPU_ENABLED V evaluate_rational(const T* num, const U* denom, const V& z_, BOOST_MATH_SIZE_T count) BOOST_MATH_NOEXCEPT(V) +BOOST_MATH_GPU_ENABLED V evaluate_rational(const T* num, const U* denom, const V& z_, boost::math::size_t count) BOOST_MATH_NOEXCEPT(V) { V z(z_); V s1, s2; @@ -320,14 +321,14 @@ BOOST_MATH_GPU_ENABLED V evaluate_rational(const T* num, const U* denom, const V return s1 / s2; } -template +template BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_rational(const T(&a)[N], const U(&b)[N], const V& z) BOOST_MATH_NOEXCEPT(V) { return detail::evaluate_rational_c_imp(a, b, z, static_cast*>(nullptr)); } #ifndef BOOST_MATH_HAS_NVRTC -template +template BOOST_MATH_GPU_ENABLED BOOST_MATH_GPU_ENABLED inline V evaluate_rational(const std::array& a, const std::array& b, const V& z) BOOST_MATH_NOEXCEPT(V) { return detail::evaluate_rational_c_imp(a.data(), b.data(), z, static_cast*>(nullptr)); diff --git a/include/boost/math/tools/roots.hpp b/include/boost/math/tools/roots.hpp index 97e67fae9..8f36aa22d 100644 --- a/include/boost/math/tools/roots.hpp +++ b/include/boost/math/tools/roots.hpp @@ -9,6 +9,11 @@ #ifdef _MSC_VER #pragma once #endif + +#include + +#ifndef BOOST_MATH_HAS_NVRTC // Disabled for now + #include // test for multiprecision types in complex Newton #include @@ -16,7 +21,6 @@ #include #include -#include #include #include @@ -1025,4 +1029,6 @@ inline std::pair::type, typename tools: } // namespace math } // namespace boost +#endif // BOOST_MATH_HAS_NVRTC + #endif // BOOST_MATH_TOOLS_NEWTON_SOLVER_HPP diff --git a/include/boost/math/tools/type_traits.hpp b/include/boost/math/tools/type_traits.hpp index 
a37e8e7eb..a13332797 100644 --- a/include/boost/math/tools/type_traits.hpp +++ b/include/boost/math/tools/type_traits.hpp @@ -12,13 +12,13 @@ #include -namespace boost { -namespace math { - #ifdef BOOST_MATH_ENABLE_CUDA #include +namespace boost { +namespace math { + // Helper classes using cuda::std::integral_constant; using cuda::std::true_type; @@ -163,6 +163,9 @@ using cuda::std::underlying_type_t; #include +namespace boost { +namespace math { + // Helper classes using std::integral_constant; using std::true_type; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 8d33eabb5..7e57f93ce 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -88,6 +88,8 @@ run test_saspoint5_quan_nvrtc_double.cpp ; run test_saspoint5_quan_nvrtc_float.cpp ; # Special Functions +run test_beta_nvrtc_double.cpp ; +run test_beta_nvrtc_float.cpp ; run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; run test_cos_pi_nvrtc_double.cpp ; @@ -110,10 +112,14 @@ run test_gamma_nvrtc_double.cpp ; run test_gamma_nvrtc_float.cpp ; run test_log1p_nvrtc_double.cpp ; run test_log1p_nvrtc_float.cpp ; +run test_modf_nvrtc_double.cpp ; +run test_modf_nvrtc_float.cpp ; run test_round_nvrtc_double.cpp ; run test_round_nvrtc_float.cpp ; run test_sign_nvrtc_double.cpp ; run test_sign_nvrtc_float.cpp ; run test_sin_pi_nvrtc_double.cpp ; run test_sin_pi_nvrtc_float.cpp ; +run test_trigamma_nvrtc_double.cpp ; +run test_trigamma_nvrtc_float.cpp ; run test_trunc_nvrtc_double.cpp ; diff --git a/test/test_beta_nvrtc_double.cpp b/test/test_beta_nvrtc_double.cpp new file mode 100644 index 000000000..fdc502a19 --- /dev/null +++ b/test/test_beta_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::beta(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", 
"--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * 
sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::beta(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_nvrtc_float.cpp b/test/test_beta_nvrtc_float.cpp new file mode 100644 index 000000000..d403d3315 --- /dev/null +++ b/test/test_beta_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::beta(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", 
"--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * 
sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::beta(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_modf_nvrtc_double.cpp b/test/test_modf_nvrtc_double.cpp new file mode 100644 index 000000000..f172dd52c --- /dev/null +++ b/test/test_modf_nvrtc_double.cpp @@ -0,0 +1,200 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_modf_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + float_type fract; + int i_part; + long l_part; + long long ll_part; + + if (i < numElements) + { + out[i] = boost::math::modf(in1[i], &fract) + + boost::math::modf(in1[i], &i_part) + + boost::math::modf(in1[i], &l_part) + + boost::math::modf(in1[i], &ll_part); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_modf_kernel.cu", 0, nullptr, nullptr); + 
checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_modf_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_modf_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + 
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + float_type fract; + const auto res = 4 * boost::math::modf(h_in1[i], &fract); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_modf_nvrtc_float.cpp b/test/test_modf_nvrtc_float.cpp new file mode 100644 index 000000000..1dcd3c081 --- /dev/null +++ b/test/test_modf_nvrtc_float.cpp @@ -0,0 +1,200 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_modf_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + float_type fract; + int i_part; + long l_part; + long long ll_part; + + if (i < numElements) + { + out[i] = boost::math::modf(in1[i], &fract) + + boost::math::modf(in1[i], &i_part) + + boost::math::modf(in1[i], &l_part) + + boost::math::modf(in1[i], &ll_part); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + 
+ res = nvrtcCreateProgram(&prog, cuda_kernel, "test_modf_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_modf_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_modf_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + 
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + float_type fract; + const auto res = 4 * boost::math::modf(h_in1[i], &fract); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_trigamma_nvrtc_double.cpp b/test/test_trigamma_nvrtc_double.cpp new file mode 100644 index 000000000..46877acce --- /dev/null +++ b/test/test_trigamma_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_trigamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::trigamma(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create 
CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trigamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_trigamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_trigamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i 
= 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::trigamma(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_trigamma_nvrtc_float.cpp b/test/test_trigamma_nvrtc_float.cpp new file mode 100644 index 000000000..083c7d876 --- /dev/null +++ b/test/test_trigamma_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_trigamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::trigamma(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA 
context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trigamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_trigamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_trigamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; 
i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::trigamma(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +}