Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New fill kernel and validation tolerances #251

Merged
merged 4 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions library/src/data_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,102 @@ namespace hiptensor
return;
}
}

// Returns the symbolic name of a hiptensorComputeType_t value
// (e.g. "HIPTENSOR_COMPUTE_32F"). Any value not in the table —
// including HIPTENSOR_COMPUTE_NONE itself — maps to
// "HIPTENSOR_COMPUTE_NONE".
std::string computeTypeToString(hiptensorComputeType_t computeType)
{
    // Local name/value table; scanned in declaration order so the
    // first match wins, exactly like the original if/else chain.
    struct Entry
    {
        hiptensorComputeType_t type;
        const char*            name;
    };
    static const Entry table[] = {
        {HIPTENSOR_COMPUTE_16BF, "HIPTENSOR_COMPUTE_16BF"},
        {HIPTENSOR_COMPUTE_16F, "HIPTENSOR_COMPUTE_16F"},
        {HIPTENSOR_COMPUTE_32F, "HIPTENSOR_COMPUTE_32F"},
        {HIPTENSOR_COMPUTE_64F, "HIPTENSOR_COMPUTE_64F"},
        {HIPTENSOR_COMPUTE_8I, "HIPTENSOR_COMPUTE_8I"},
        {HIPTENSOR_COMPUTE_8U, "HIPTENSOR_COMPUTE_8U"},
        {HIPTENSOR_COMPUTE_32I, "HIPTENSOR_COMPUTE_32I"},
        {HIPTENSOR_COMPUTE_32U, "HIPTENSOR_COMPUTE_32U"},
        {HIPTENSOR_COMPUTE_C32F, "HIPTENSOR_COMPUTE_C32F"},
        {HIPTENSOR_COMPUTE_C64F, "HIPTENSOR_COMPUTE_C64F"},
    };

    for(auto const& entry : table)
    {
        if(entry.type == computeType)
        {
            return entry.name;
        }
    }

    // Fallback for unknown / NONE values, matching the original's final else.
    return "HIPTENSOR_COMPUTE_NONE";
}

// Returns the symbolic name of a hipDataType value (e.g. "HIP_R_32F").
// Any value not in the table maps to "HIP_TYPE_NONE".
std::string hipTypeToString(hipDataType hipType)
{
    // Local name/value table; scanned in declaration order so the
    // first match wins, exactly like the original if/else chain.
    struct Entry
    {
        hipDataType type;
        const char* name;
    };
    static const Entry table[] = {
        {HIP_R_16BF, "HIP_R_16BF"},
        {HIP_R_16F, "HIP_R_16F"},
        {HIP_R_32F, "HIP_R_32F"},
        {HIP_R_64F, "HIP_R_64F"},
        {HIP_R_8I, "HIP_R_8I"},
        {HIP_R_8U, "HIP_R_8U"},
        {HIP_R_32I, "HIP_R_32I"},
        {HIP_R_32U, "HIP_R_32U"},
        {HIP_C_32F, "HIP_C_32F"},
        {HIP_C_64F, "HIP_C_64F"},
    };

    for(auto const& entry : table)
    {
        if(entry.type == hipType)
        {
            return entry.name;
        }
    }

    // Fallback for unrecognized hipDataType values, matching the original.
    return "HIP_TYPE_NONE";
}
} // namespace hiptensor

bool operator==(hipDataType hipType, hiptensorComputeType_t computeType)
Expand Down
3 changes: 3 additions & 0 deletions library/src/include/data_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ namespace hiptensor
T readVal(void const* value, hiptensorComputeType_t id);

void writeVal(void const* addr, hiptensorComputeType_t id, ScalarData value);

std::string computeTypeToString(hiptensorComputeType_t computeType);
std::string hipTypeToString(hipDataType hipType);
} // namespace hiptensor

bool operator==(hipDataType hipType, hiptensorComputeType_t computeType);
Expand Down
34 changes: 30 additions & 4 deletions test/01_contraction/contraction_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -656,13 +656,36 @@ namespace hiptensor
auto reference = resource->allocDevice(sizeD);
resource->copyData(reference, resource->hostD(), sizeD);

// Compute tolerance based on compute type
auto dimension = a_ms_ks.mLengths.size() / 2;
auto nelems_k = std::accumulate(a_ms_ks.mLengths.begin() + dimension,
a_ms_ks.mLengths.end(),
size_t{1},
std::multiplies<size_t>());

auto eps = getEpsilon(computeType);
CongMa13 marked this conversation as resolved.
Show resolved Hide resolved
double tolerance = 2 * nelems_k * eps;

// use the same default tolerance value as CK
if (computeType == HIPTENSOR_COMPUTE_16BF || DDataType == HIP_R_16BF)
{
const double epsilon = std::pow(2, -7);
tolerance += epsilon * 2;
}
else if (computeType == HIPTENSOR_COMPUTE_16F || DDataType == HIP_R_16F)
{
const double epsilon = std::pow(2, -10);
tolerance += epsilon * 2;
}

if(DDataType == HIP_R_16F)
{
std::tie(mValidationResult, mMaxRelativeError)
= compareEqualLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(),
(_Float16*)reference.get(),
elementsCD,
computeType);
computeType,
tolerance);
}
else if(DDataType == HIP_R_16BF)
{
Expand All @@ -671,23 +694,26 @@ namespace hiptensor
(hip_bfloat16*)resource->deviceD().get(),
(hip_bfloat16*)reference.get(),
elementsCD,
computeType);
computeType,
tolerance);
}
else if(DDataType == HIP_R_32F || DDataType == HIP_C_32F)
{
std::tie(mValidationResult, mMaxRelativeError)
= compareEqualLaunchKernel<float>((float*)resource->deviceD().get(),
(float*)reference.get(),
elementsCD,
computeType);
computeType,
tolerance);
}
else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F)
{
std::tie(mValidationResult, mMaxRelativeError)
= compareEqualLaunchKernel<double>((double*)resource->deviceD().get(),
(double*)reference.get(),
elementsCD,
computeType);
computeType,
tolerance);
}

EXPECT_TRUE(mValidationResult) << "Max relative error: " << mMaxRelativeError;
Expand Down
24 changes: 19 additions & 5 deletions test/device/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,28 +65,42 @@ __global__ static void
}
}

// PCG-style integer hash: an LCG step followed by a permuted-output
// function (xorshift-by-variable-amount, multiply, final xorshift).
// Deterministic per input, so every thread can derive its own value
// from its index with no shared RNG state.
__device__ inline unsigned pcg_hash(unsigned input)
{
    unsigned const lcgState  = input * 747796405u + 2891336453u;
    unsigned const permuted  = ((lcgState >> ((lcgState >> 28u) + 4u)) ^ lcgState) * 277803737u;
    return (permuted >> 22u) ^ permuted;
}

// Generates a pseudo-random float in [-range, range), derived
// deterministically from 'input' via pcg_hash.
template <unsigned range = 1>
__device__ inline float gen_random_float(unsigned input)
{
    // Hash mapped to [0, 1), shifted to [-0.5, 0.5), then scaled.
    // Note: the final multiply-by-2 is exact in IEEE float, so this
    // grouping produces bit-identical results to (x * range) * 2.
    float const unit = static_cast<float>(pcg_hash(input)) / static_cast<float>(UINT_MAX);
    return (unit - 0.5f) * static_cast<float>(range) * 2;
}

// fill kernel for 'elementSize' elements
template <typename DataType>
__global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed)
__global__ void fillKernel(DataType* data, uint32_t elementSize)
{
uint32_t index = (blockIdx.x * blockDim.x + threadIdx.x);

if(index < elementSize)
{
// Input values scaled by 10, Doing UnarySquare Operation for tensors(16F) may cause overflow.
if constexpr(std::is_same_v<DataType, hipFloatComplex>)
{
auto value = (float(index / float(RAND_MAX) - 0.5) * 10) / elementSize;
auto value = gen_random_float(index);
data[index] = make_hipFloatComplex(value, value);
}
else if constexpr(std::is_same_v<DataType, hipDoubleComplex>)
{
auto value = (double(index / double(RAND_MAX) - 0.5) * 10) / elementSize;
auto value = static_cast<double>(gen_random_float(index));
data[index] = make_hipDoubleComplex(value, value);
}
else
{
auto value = (DataType(index / double(RAND_MAX) - 0.5) * 10) / elementSize;
auto value = gen_random_float(index);
data[index] = static_cast<DataType>(value);
}
}
Expand Down
52 changes: 45 additions & 7 deletions test/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,7 @@ __host__ static inline void fillLaunchKernel(DataType* data, uint32_t elementSiz
0,
0,
data,
elementSize,
static_cast<uint32_t>(std::time(nullptr)));
elementSize);
}

// fill kernel wrapper for 'elementSize' elements with a specific value
Expand All @@ -194,7 +193,7 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
DDataType const* hostD,
std::size_t elementsD,
hiptensorComputeType_t computeType,
double tolerance = 100.0)
double tolerance = 0.0)
{
bool retval = true;
double max_relative_error = 0.0;
Expand Down Expand Up @@ -245,7 +244,25 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
}
}

auto eps = getEpsilon(computeType);
if(tolerance == 0.0)
{
// use the same default tolerance value as CK
if (computeType == HIPTENSOR_COMPUTE_16BF || std::is_same_v<DDataType, hiptensor::bfloat16_t>)
{
const double epsilon = std::pow(2, -7);
tolerance = epsilon * 2;
}
else if (computeType == HIPTENSOR_COMPUTE_16F || std::is_same_v<DDataType, hiptensor::float16_t>)
{
const double epsilon = std::pow(2, -10);
tolerance = epsilon * 2;
}
else
{
tolerance = 1e-5;
}
}

if(isInf)
{
retval = false;
Expand All @@ -256,7 +273,7 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
retval = false;
max_relative_error = std::numeric_limits<DDataType>::signaling_NaN();
}
else if(max_relative_error > (eps * tolerance))
else if(max_relative_error > tolerance)
{
retval = false;
}
Expand All @@ -269,13 +286,14 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD,
DDataType* hostD,
std::size_t elementsD,
hiptensorComputeType_t computeType,
double tolerance = 100.0)
double tolerance = 0.0)
{
auto blockDim = dim3(1024, 1, 1);
auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1);

double* d_relativeError;
double maxRelativeError;

CHECK_HIP_ERROR(hipMalloc(&d_relativeError, elementsD * sizeof(double)));

hipEvent_t syncEvent;
Expand Down Expand Up @@ -332,12 +350,32 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD,
= [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); };

auto eps = getEpsilon(computeType);

if(tolerance == 0.0)
{
// use the same default tolerance value as CK
if (computeType == HIPTENSOR_COMPUTE_16BF || std::is_same_v<DDataType, hiptensor::bfloat16_t>)
{
const double epsilon = std::pow(2, -7);
tolerance = epsilon * 2;
}
else if (computeType == HIPTENSOR_COMPUTE_16F || std::is_same_v<DDataType, hiptensor::float16_t>)
{
const double epsilon = std::pow(2, -10);
tolerance = epsilon * 2;
}
else
{
tolerance = 1e-5;
}
}

if(isNaN)
{
retval = false;
maxRelativeError = std::numeric_limits<DDataType>::signaling_NaN();
}
else if(maxRelativeError > (eps * tolerance))
else if(maxRelativeError > (tolerance))
{
retval = false;
}
Expand Down
Loading