Skip to content

Commit

Permalink
Reduce f16 (#261)
Browse files Browse the repository at this point in the history
- Support f16/f16, bf16/bf16, f16/f32, bf16/f32 reduction
- Use f32 as compute type for f16/f16, bf16/bf16
- Support independent C and D
- Add another CK instance with InSrcVectorDim==1 which supports far right dim
- Support reducing a tensor to a scalar
- Support permutation of output of reduction
- Rename label [Reduced Dims] to [Output Dims] in reduction test config file 
- Comment out all test cases where beta != 0. Will add the test cases back when the CK fix is merged into amd_master
- Fix bugs
  • Loading branch information
CongMa13 authored Aug 20, 2024
1 parent cc1a585 commit e778abc
Show file tree
Hide file tree
Showing 111 changed files with 2,667 additions and 1,760 deletions.
9 changes: 8 additions & 1 deletion library/include/hiptensor/internal/hiptensor_utility.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
#define HIPTENSOR_UTILITY_INTERNAL_HPP

#include <fstream>
#include <hip/hip_complex.h>
#include <hip/hip_runtime.h>
#include <iostream>
#include <hip/hip_complex.h>

#include "../hiptensor_types.hpp"
#include "types_ext.hpp"
Expand Down Expand Up @@ -131,6 +131,13 @@ void hiptensorPrintElementsToFile(std::ofstream& fs, F* output, size_t size, std
return;
}

/// Equality comparison for tensor descriptors.
/// Two descriptors are considered equal when their data type, lengths,
/// strides, and unary operator all match.
inline bool operator==(const hiptensorTensorDescriptor_t& lhs,
                       const hiptensorTensorDescriptor_t& rhs)
{
    // Compare the cheap scalar fields first, then the vectors.
    if(lhs.mType != rhs.mType || lhs.mUnaryOp != rhs.mUnaryOp)
    {
        return false;
    }
    return lhs.mLengths == rhs.mLengths && lhs.mStrides == rhs.mStrides;
}

namespace std
{
static ostream& operator<<(ostream& os, const hiptensorTensorDescriptor_t& desc)
Expand Down
22 changes: 13 additions & 9 deletions library/src/hiptensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,18 +151,17 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han
return HIPTENSOR_STATUS_NOT_INITIALIZED;
}

if((lens == nullptr)
if((lens == nullptr && strides != nullptr)
|| ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F)
&& (dataType != HIP_R_64F) && (dataType != HIP_C_32F)
&& (dataType != HIP_C_64F))
&& (dataType != HIP_R_64F) && (dataType != HIP_C_32F) && (dataType != HIP_C_64F))
|| ((unaryOp != HIPTENSOR_OP_IDENTITY) && (unaryOp != HIPTENSOR_OP_SQRT)))
{
auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE;
if(lens == nullptr)
if(lens == nullptr && strides != nullptr)
{
snprintf(msg,
sizeof(msg),
"Tensor Initialization Error : lens = nullptr (%s)",
"Tensor Initialization Error : lens = nullptr and strides != nullptr (%s)",
hiptensorGetErrorString(errorCode));
}
else if((unaryOp != HIPTENSOR_OP_IDENTITY) && (unaryOp != HIPTENSOR_OP_SQRT))
Expand Down Expand Up @@ -200,10 +199,15 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han
else
{
// Re-construct strides from lengths, assuming packed.
std::vector<std::size_t> l(lens, lens + numModes);
std::vector<std::size_t> s = hiptensor::stridesFromLengths(l);

*desc = {dataType, l, s, unaryOp};
if(numModes > 0)
{
auto lensVector = std::vector<std::size_t>(lens, lens + numModes);
*desc = {dataType, lensVector, hiptensor::stridesFromLengths(lensVector), unaryOp};
}
else
{
*desc = {dataType, std::vector<std::size_t>(), std::vector<std::size_t>(), unaryOp};
}
}

return HIPTENSOR_STATUS_SUCCESS;
Expand Down
3 changes: 2 additions & 1 deletion library/src/include/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ namespace hiptensor
}

// Re-construct strides from lengths, assuming packed.
std::vector<std::size_t> strides(lengths.size(), 1);
std::vector<T> strides(lengths.size(), 1);
if(!col_major)
{
strides.back() = 1;
Expand All @@ -68,6 +68,7 @@ namespace hiptensor
return strides;
}

// Get count of element of a tensor. Note that the count is 1 if the rank of tensor is 0.
template <typename T>
static inline T elementsFromLengths(std::vector<T> const& lengths)
{
Expand Down
42 changes: 42 additions & 0 deletions library/src/reduction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,48 @@ set(HIPTENSOR_REDUCTION_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/reduction_cpu_reference.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_cpu_reference_instances.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_1_1_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_1_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_2_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_1_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_2_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_3_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_1_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_2_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_3_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_4_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_1_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_2_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_3_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_4_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_5_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_1_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_2_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_3_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_4_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_5_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_6_f16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_1_1_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_1_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_2_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_1_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_2_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_3_3_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_1_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_2_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_3_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_4_4_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_1_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_2_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_3_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_4_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_5_5_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_1_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_2_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_3_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_4_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_5_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_6_6_bf16_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_1_1_f32_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_1_f32_f32_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction_solution_2_2_f32_f32_instance.cpp
Expand Down
66 changes: 44 additions & 22 deletions library/src/reduction/hiptensor_reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
*
*******************************************************************************/
#include <hiptensor/hiptensor.hpp>
#include <set>
#include <unordered_set>

#include "handle.hpp"
Expand Down Expand Up @@ -74,8 +75,7 @@ namespace
using hiptensor::Logger;
auto& logger = Logger::instance();
char msg[2048];
if(!handle || !alpha || !A || !descA || !modeA || !beta || !C || !descC || !modeC || !D
|| !descD || !modeD)
if(!handle || !alpha || !A || !descA || !modeA || !beta || !descC || !D || !descD)
{
auto errorCode = HIPTENSOR_STATUS_NOT_INITIALIZED;
auto printErrorMessage = [&logger, errorCode](const std::string& paramName) {
Expand Down Expand Up @@ -111,18 +111,10 @@ namespace
{
printErrorMessage("beta");
}
if(!C)
{
printErrorMessage("C");
}
if(!descC)
{
printErrorMessage("descC");
}
if(!modeC)
{
printErrorMessage("modeC");
}
if(!D)
{
printErrorMessage("D");
Expand All @@ -131,10 +123,6 @@ namespace
{
printErrorMessage("descD");
}
if(!modeD)
{
printErrorMessage("modeD");
}
return errorCode;
}

Expand All @@ -161,6 +149,22 @@ namespace
return errorCode;
}

auto modeSetA = std::set(modeA, modeA + descA->mLengths.size());
auto modeSetC = std::set(modeC, modeC + descC->mLengths.size());
if(descA->mLengths.size() < descC->mLengths.size() || !(*descC == *descD)
|| !std::includes(
modeSetA.cbegin(), modeSetA.cend(), modeSetC.cbegin(), modeSetC.cend()))
{
auto errorCode = HIPTENSOR_STATUS_NOT_SUPPORTED;
snprintf(msg,
sizeof(msg),
"Unsupported Data Error : The descriptor of C and D should be same and "
" modes of C should be subset of modes A. (%s)",
hiptensorGetErrorString(errorCode));
logger->logError("hiptensorReduction", msg);
return errorCode;
}

return HIPTENSOR_STATUS_SUCCESS;
}
}
Expand Down Expand Up @@ -239,7 +243,7 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t* handle,
auto errorCode = HIPTENSOR_STATUS_INTERNAL_ERROR;
snprintf(msg,
sizeof(msg),
"Internal Error : No Kernels Found (%s)",
"Internal Error : ReductionSolutionInstances is empty (%s)",
hiptensorGetErrorString(errorCode));
logger->logError("hiptensorReduction", msg);
return errorCode;
Expand All @@ -250,9 +254,16 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t* handle,
auto ADataType = descA->mType;
auto DDataType = descD->mType;

auto internalTypeCompute = typeCompute;
if(typeCompute == HIPTENSOR_COMPUTE_16F || typeCompute == HIPTENSOR_COMPUTE_16BF)
{
// CK does not support f16 or bf16 as compute type
internalTypeCompute = HIPTENSOR_COMPUTE_32F;
}

// Query reduction solutions for the correct reduction operation and type
auto solutionQ = instances->querySolutions(ADataType,
typeCompute,
internalTypeCompute,
DDataType,
rankA,
numReduceDim,
Expand All @@ -265,7 +276,7 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t* handle,
auto errorCode = HIPTENSOR_STATUS_INTERNAL_ERROR;
snprintf(msg,
sizeof(msg),
"Internal Error : No Kernels Found (%s)",
"Internal Error : querySolutions returns 0 kernel. (%s)",
hiptensorGetErrorString(errorCode));
logger->logError("hiptensorReduction", msg);
return errorCode;
Expand All @@ -282,6 +293,17 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t* handle,
betaD = hiptensor::readVal<double>(beta, typeCompute);
}

if(C && C != D)
{
// CK API can only process $D = alpha * reduce(A) + beta * D$
// Need to copy C to D if C != D
CHECK_HIP_ERROR(hipMemcpy(D,
C,
hiptensor::elementsFromLengths(descC->mLengths)
* hiptensor::hipDataTypeSize(descC->mType),
hipMemcpyDeviceToDevice));
}

for(auto [_, pSolution] : solutionQ.solutions())
{
// Perform reduction with timing if LOG_LEVEL_PERF_TRACE
Expand All @@ -296,11 +318,11 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t* handle,
}:
StreamConfig{stream, false};
auto [isSupported, time] = (*pSolution)(descA->mLengths,
{},
descA->mStrides,
{modeA, modeA + descA->mLengths.size()},
descC->mLengths,
{},
{modeC, modeC + descC->mLengths.size()},
descD->mLengths,
descD->mStrides,
{modeD, modeD + descD->mLengths.size()},
alphaD,
betaD,
A,
Expand Down Expand Up @@ -347,7 +369,7 @@ hiptensorStatus_t hiptensorReduction(const hiptensorHandle_t* handle,
auto errorCode = HIPTENSOR_STATUS_INTERNAL_ERROR;
snprintf(msg,
sizeof(msg),
"Selected kernel is unable to solve the problem (%s)",
"No kernel is able to solve the problem (%s)",
hiptensorGetErrorString(errorCode));
logger->logError("hiptensorReduction", msg);
return errorCode;
Expand Down
33 changes: 25 additions & 8 deletions library/src/reduction/reduction_cpu_reference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,22 @@ hiptensorStatus_t hiptensorReductionReference(const void*
auto ADataType = descA->mType;
auto DDataType = descD->mType;

auto internalTypeCompute = typeCompute;
if(typeCompute == HIPTENSOR_COMPUTE_16F || typeCompute == HIPTENSOR_COMPUTE_16BF)
{
// CK does not support f16 or bf16 as compute type
internalTypeCompute = HIPTENSOR_COMPUTE_32F;
}

auto& instances = hiptensor::ReductionCpuReferenceInstances::instance();
auto solutionQ = instances->querySolutions(ADataType,
typeCompute,
internalTypeCompute,
DDataType,
rankA,
numReduceDim,
opReduce,
true, // @TODO hardcode
false); // @TODO hardcode
true, // propagateNan
false); // outputIndex

double alphaD;
if(alpha != nullptr)
Expand All @@ -69,17 +76,27 @@ hiptensorStatus_t hiptensorReductionReference(const void*
betaD = hiptensor::readVal<double>(beta, typeCompute);
}

if(C && C != D)
{
// CK API can only process $D = alpha * reduce(A) + beta * D$
// Need to copy C to D if C != D
CHECK_HIP_ERROR(hipMemcpy(D,
C,
hiptensor::elementsFromLengths(descC->mLengths)
* hiptensor::hipDataTypeSize(descC->mType),
hipMemcpyHostToHost));
}

for(auto [_, pSolution] : solutionQ.solutions())
{
// Perform reduction with timing if LOG_LEVEL_PERF_TRACE
auto streamConfig = StreamConfig{stream, false};
auto [isSupported, time] = (*pSolution)(descA->mLengths,
// @todo pass stride from descA
{},
descA->mStrides,
{modeA, modeA + descA->mLengths.size()},
descC->mLengths,
{},
{modeC, modeC + descC->mLengths.size()},
descD->mLengths,
descD->mStrides,
{modeD, modeD + descD->mLengths.size()},
alphaD,
betaD,
A,
Expand Down
Loading

0 comments on commit e778abc

Please sign in to comment.