Reduce f16 (ROCm#261)

- Support f16/f16, bf16/bf16, f16/f32, bf16/f32 reduction - Use f32 as compute type for f16/f16, bf16/bf16 - Support independent C and D - Add another CK instance with InSrcVectorDim==1 which supports far right dim - Support reducing a tensor to a scalar - Support permutation of output of reduction - Rename label [Reduced Dims] to [Output Dims] in reduction test config file - Commnet out all test cases that beta != 0. Will add the test cases back when the CK fix is merged into amd_master - Fix bugs
CongMa13 · Aug 20, 2024 · 6ff07da · 6ff07da
1 parent 2b39d23
commit 6ff07da
Show file tree

Hide file tree

Showing 111 changed files with 2,667 additions and 1,760 deletions.
diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp
@@ -27,9 +27,9 @@
 #define HIPTENSOR_UTILITY_INTERNAL_HPP
 
 #include <fstream>
+#include <hip/hip_complex.h>
 #include <hip/hip_runtime.h>
 #include <iostream>
-#include <hip/hip_complex.h>
 
 #include "../hiptensor_types.hpp"
 #include "types_ext.hpp"
@@ -131,6 +131,13 @@ void hiptensorPrintElementsToFile(std::ofstream& fs, F* output, size_t size, std
     return;
 }
 
+bool inline operator==(const hiptensorTensorDescriptor_t& lhs,
+                       const hiptensorTensorDescriptor_t& rhs)
+{
+    return lhs.mType == rhs.mType && lhs.mLengths == rhs.mLengths && lhs.mStrides == rhs.mStrides
+           && lhs.mUnaryOp == rhs.mUnaryOp;
+}
+
 namespace std
 {
     static ostream& operator<<(ostream& os, const hiptensorTensorDescriptor_t& desc)

diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp
@@ -151,18 +151,17 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t*     han
         return HIPTENSOR_STATUS_NOT_INITIALIZED;
     }
 
-    if((lens == nullptr)
+    if((lens == nullptr && strides != nullptr)
        || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F)
-           && (dataType != HIP_R_64F) && (dataType != HIP_C_32F)
-           && (dataType != HIP_C_64F))
+           && (dataType != HIP_R_64F) && (dataType != HIP_C_32F) && (dataType != HIP_C_64F))
        || ((unaryOp != HIPTENSOR_OP_IDENTITY) && (unaryOp != HIPTENSOR_OP_SQRT)))
     {
         auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE;
-        if(lens == nullptr)
+        if(lens == nullptr && strides != nullptr)
         {
             snprintf(msg,
                      sizeof(msg),
-                     "Tensor Initialization Error : lens = nullptr (%s)",
+                     "Tensor Initialization Error : lens = nullptr and strides != nullptr (%s)",
                      hiptensorGetErrorString(errorCode));
         }
         else if((unaryOp != HIPTENSOR_OP_IDENTITY) && (unaryOp != HIPTENSOR_OP_SQRT))
@@ -200,10 +199,15 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t*     han
     else
     {
         // Re-construct strides from lengths, assuming packed.
-        std::vector<std::size_t> l(lens, lens + numModes);
-        std::vector<std::size_t> s = hiptensor::stridesFromLengths(l);
-
-        *desc = {dataType, l, s, unaryOp};
+        if(numModes > 0)
+        {
+            auto lensVector = std::vector<std::size_t>(lens, lens + numModes);
+            *desc = {dataType, lensVector, hiptensor::stridesFromLengths(lensVector), unaryOp};
+        }
+        else
+        {
+            *desc = {dataType, std::vector<std::size_t>(), std::vector<std::size_t>(), unaryOp};
+        }
     }
 
     return HIPTENSOR_STATUS_SUCCESS;

diff --git a/library/src/include/util.hpp b/library/src/include/util.hpp
@@ -51,7 +51,7 @@ namespace hiptensor
         }
 
         // Re-construct strides from lengths, assuming packed.
-        std::vector<std::size_t> strides(lengths.size(), 1);
+        std::vector<T> strides(lengths.size(), 1);
         if(!col_major)
         {
             strides.back() = 1;
@@ -68,6 +68,7 @@ namespace hiptensor
         return strides;
     }
 
+    // Get count of element of a tensor. Note that the count is 1 if the rank of tensor is 0.
     template <typename T>
     static inline T elementsFromLengths(std::vector<T> const& lengths)
     {

diff --git a/library/src/reduction/CMakeLists.txt b/library/src/reduction/CMakeLists.txt
@@ -33,6 +33,48 @@ set(HIPTENSOR_REDUCTION_SOURCES
    ${CMAKE_CURRENT_SOURCE_DIR}/reduction_cpu_reference.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/reduction_cpu_reference_instances.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_1_1_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_1_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_2_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_1_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_2_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_3_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_1_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_2_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_3_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_4_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_1_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_2_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_3_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_4_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_5_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_1_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_2_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_3_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_4_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_5_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_6_f16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_1_1_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_1_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_2_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_1_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_2_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_3_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_1_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_2_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_3_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_4_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_1_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_2_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_3_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_4_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_5_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_1_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_2_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_3_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_4_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_5_bf16_f32_instance.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_6_bf16_f32_instance.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_1_1_f32_f32_instance.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_1_f32_f32_instance.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_2_f32_f32_instance.cpp

diff --git a/library/src/reduction/hiptensor_reduction.cpp b/library/src/reduction/hiptensor_reduction.cpp
@@ -24,6 +24,7 @@
  *
  *******************************************************************************/
 #include <hiptensor/hiptensor.hpp>
+#include <set>
 #include <unordered_set>
 
 #include "handle.hpp"
@@ -74,8 +75,7 @@ namespace
         using hiptensor::Logger;
         auto& logger = Logger::instance();
         char  msg[2048];
-        if(!handle || !alpha || !A || !descA || !modeA || !beta || !C || !descC || !modeC || !D
-           || !descD || !modeD)
+        if(!handle || !alpha || !A || !descA || !modeA || !beta || !descC || !D || !descD)
         {
             auto errorCode         = HIPTENSOR_STATUS_NOT_INITIALIZED;
             auto printErrorMessage = [&logger, errorCode](const std::string& paramName) {
@@ -111,18 +111,10 @@ namespace
             {
                 printErrorMessage("beta");
             }
-            if(!C)
-            {
-                printErrorMessage("C");
-            }
             if(!descC)
             {
                 printErrorMessage("descC");
             }
-            if(!modeC)
-            {
-                printErrorMessage("modeC");
-            }
             if(!D)
             {
                 printErrorMessage("D");
@@ -131,10 +123,6 @@ namespace
             {
                 printErrorMessage("descD");
             }
-            if(!modeD)
-            {
-                printErrorMessage("modeD");
-            }
             return errorCode;
         }
 
@@ -161,6 +149,22 @@ namespace
             return errorCode;
         }
 
+        auto modeSetA = std::set(modeA, modeA + descA->mLengths.size());
+        auto modeSetC = std::set(modeC, modeC + descC->mLengths.size());
+        if(descA->mLengths.size() < descC->mLengths.size() || !(*descC == *descD)
+           || !std::includes(
+               modeSetA.cbegin(), modeSetA.cend(), modeSetC.cbegin(), modeSetC.cend()))
+        {
+            auto errorCode = HIPTENSOR_STATUS_NOT_SUPPORTED;
+            snprintf(msg,
+                     sizeof(msg),
+                     "Unsupported Data Error : The descriptor of C and D should be same and "
+                     " modes of C should be subset of modes A. (%s)",
+                     hiptensorGetErrorString(errorCode));
+            logger->logError("hiptensorReduction", msg);
+            return errorCode;
+        }
+
         return HIPTENSOR_STATUS_SUCCESS;
     }
 }
@@ -239,7 +243,7 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t*           handle,
         auto errorCode = HIPTENSOR_STATUS_INTERNAL_ERROR;
         snprintf(msg,
                  sizeof(msg),
-                 "Internal Error : No Kernels Found (%s)",
+                 "Internal Error : ReductionSolutionInstances is empty (%s)",
                  hiptensorGetErrorString(errorCode));
         logger->logError("hiptensorReduction", msg);
         return errorCode;
@@ -250,9 +254,16 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t*           handle,
     auto ADataType    = descA->mType;
     auto DDataType    = descD->mType;
 
+    auto internalTypeCompute = typeCompute;
+    if(typeCompute == HIPTENSOR_COMPUTE_16F || typeCompute == HIPTENSOR_COMPUTE_16BF)
+    {
+        // CK does not support f16 or bf16 as compute type
+        internalTypeCompute = HIPTENSOR_COMPUTE_32F;
+    }
+
     // Query reduction solutions for the correct reduction operation and type
     auto solutionQ = instances->querySolutions(ADataType,
-                                               typeCompute,
+                                               internalTypeCompute,
                                                DDataType,
                                                rankA,
                                                numReduceDim,
@@ -265,7 +276,7 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t*           handle,
         auto errorCode = HIPTENSOR_STATUS_INTERNAL_ERROR;
         snprintf(msg,
                  sizeof(msg),
-                 "Internal Error : No Kernels Found (%s)",
+                 "Internal Error : querySolutions returns 0 kernel. (%s)",
                  hiptensorGetErrorString(errorCode));
         logger->logError("hiptensorReduction", msg);
         return errorCode;
@@ -282,6 +293,17 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t*           handle,
         betaD = hiptensor::readVal<double>(beta, typeCompute);
     }
 
+    if(C && C != D)
+    {
+        // CK API can only process $D = alpha * reduce(A) + beta * D$
+        // Need to copy C to D if C != D
+        CHECK_HIP_ERROR(hipMemcpy(D,
+                                  C,
+                                  hiptensor::elementsFromLengths(descC->mLengths)
+                                      * hiptensor::hipDataTypeSize(descC->mType),
+                                  hipMemcpyDeviceToDevice));
+    }
+
     for(auto [_, pSolution] : solutionQ.solutions())
     {
         // Perform reduction with timing if LOG_LEVEL_PERF_TRACE
@@ -296,11 +318,11 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t*           handle,
             }:
         StreamConfig{stream, false};
         auto [isSupported, time] = (*pSolution)(descA->mLengths,
-                                                {},
+                                                descA->mStrides,
                                                 {modeA, modeA + descA->mLengths.size()},
-                                                descC->mLengths,
-                                                {},
-                                                {modeC, modeC + descC->mLengths.size()},
+                                                descD->mLengths,
+                                                descD->mStrides,
+                                                {modeD, modeD + descD->mLengths.size()},
                                                 alphaD,
                                                 betaD,
                                                 A,
@@ -347,7 +369,7 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t*           handle,
     auto errorCode = HIPTENSOR_STATUS_INTERNAL_ERROR;
     snprintf(msg,
              sizeof(msg),
-             "Selected kernel is unable to solve the problem (%s)",
+             "No kernel is able to solve the problem (%s)",
              hiptensorGetErrorString(errorCode));
     logger->logError("hiptensorReduction", msg);
     return errorCode;

diff --git a/library/src/reduction/reduction_cpu_reference.cpp b/library/src/reduction/reduction_cpu_reference.cpp
@@ -48,15 +48,22 @@ hiptensorStatus_t hiptensorReductionReference(const void*
     auto ADataType    = descA->mType;
     auto DDataType    = descD->mType;
 
+    auto internalTypeCompute = typeCompute;
+    if(typeCompute == HIPTENSOR_COMPUTE_16F || typeCompute == HIPTENSOR_COMPUTE_16BF)
+    {
+        // CK does not support f16 or bf16 as compute type
+        internalTypeCompute = HIPTENSOR_COMPUTE_32F;
+    }
+
     auto& instances = hiptensor::ReductionCpuReferenceInstances::instance();
     auto  solutionQ = instances->querySolutions(ADataType,
-                                               typeCompute,
+                                               internalTypeCompute,
                                                DDataType,
                                                rankA,
                                                numReduceDim,
                                                opReduce,
-                                               true, // @TODO hardcode
-                                               false); // @TODO hardcode
+                                               true, // propagateNan
+                                               false); // outputIndex
 
     double alphaD;
     if(alpha != nullptr)
@@ -69,17 +76,27 @@ hiptensorStatus_t hiptensorReductionReference(const void*
         betaD = hiptensor::readVal<double>(beta, typeCompute);
     }
 
+    if(C && C != D)
+    {
+        // CK API can only process $D = alpha * reduce(A) + beta * D$
+        // Need to copy C to D if C != D
+        CHECK_HIP_ERROR(hipMemcpy(D,
+                                  C,
+                                  hiptensor::elementsFromLengths(descC->mLengths)
+                                      * hiptensor::hipDataTypeSize(descC->mType),
+                                  hipMemcpyHostToHost));
+    }
+
     for(auto [_, pSolution] : solutionQ.solutions())
     {
         // Perform reduction with timing if LOG_LEVEL_PERF_TRACE
         auto streamConfig        = StreamConfig{stream, false};
         auto [isSupported, time] = (*pSolution)(descA->mLengths,
-                                                // @todo pass stride from descA
-                                                {},
+                                                descA->mStrides,
                                                 {modeA, modeA + descA->mLengths.size()},
-                                                descC->mLengths,
-                                                {},
-                                                {modeC, modeC + descC->mLengths.size()},
+                                                descD->mLengths,
+                                                descD->mStrides,
+                                                {modeD, modeD + descD->mLengths.size()},
                                                 alphaD,
                                                 betaD,
                                                 A,