From b76153346e436cf423d282764ace9ed0eda07b92 Mon Sep 17 00:00:00 2001
From: Cong Ma <congma13@amd.com>
Date: Wed, 15 Nov 2023 22:31:58 +0000
Subject: [PATCH] Add type related files

new file:   include/native_types.hpp
new file:   include/native_types_impl.hpp
new file:   include/type_traits.hpp
new file:   include/types_ext.hpp
new file:   include/xfloat32.hpp
---
 .../hiptensor/internal/hiptensor_utility.hpp  |   5 -
 library/src/CMakeLists.txt                    |   2 +-
 .../contraction/contraction_meta_traits.hpp   |   2 +-
 .../contraction_solution_params.hpp           |   2 +-
 .../contraction_solution_params_impl.hpp      |   2 +-
 .../contraction_solution_registry.hpp         |   2 +-
 library/src/{types.cpp => data_types.cpp}     |   2 +-
 library/src/hiptensor.cpp                     |   2 +-
 library/src/include/config.hpp                | 184 +++++
 library/src/include/data_types.hpp            |  78 ++
 .../{types_impl.hpp => data_types_impl.hpp}   |   8 +-
 library/src/include/native_types.hpp          | 119 +++
 library/src/include/native_types_impl.hpp     |  36 +
 library/src/include/type_traits.hpp           | 749 ++++++++++++++++++
 library/src/include/types.hpp                 |  62 +-
 library/src/include/types_ext.hpp             | 195 +++++
 library/src/include/xfloat32.hpp              | 349 ++++++++
 .../src/permutation/permutation_ck_impl.hpp   |   2 +-
 .../permutation_cpu_reference_impl.hpp        |   2 +-
 test/01_contraction/contraction_test.cpp      |   2 +-
 test/02_permutation/permutation_resource.cpp  |   2 +-
 test/02_permutation/permutation_test.cpp      |   2 +-
 test/utils.hpp                                |  16 +-
 23 files changed, 1734 insertions(+), 91 deletions(-)
 rename library/src/{types.cpp => data_types.cpp} (99%)
 create mode 100644 library/src/include/config.hpp
 create mode 100644 library/src/include/data_types.hpp
 rename library/src/include/{types_impl.hpp => data_types_impl.hpp} (97%)
 create mode 100644 library/src/include/native_types.hpp
 create mode 100644 library/src/include/native_types_impl.hpp
 create mode 100644 library/src/include/type_traits.hpp
 create mode 100644 library/src/include/types_ext.hpp
 create mode 100644 library/src/include/xfloat32.hpp
diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp
index 5fd9e331..f2df2dd2 100644
--- a/library/include/hiptensor/internal/hiptensor_utility.hpp
+++ b/library/include/hiptensor/internal/hiptensor_utility.hpp
@@ -131,11 +131,6 @@ namespace std
 
         return os;
     }
-    static ostream& operator<<(ostream& os, const _Float16 value)
-    {
-        os << static_cast<float>(value);
-        return os;
-    }
 }
 
 #endif // HIPTENSOR_UTILITY_INTERNAL_HPP
diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
index 801d0761..286ed2e3 100644
--- a/library/src/CMakeLists.txt
+++ b/library/src/CMakeLists.txt
@@ -62,7 +62,7 @@ set(HIPTENSOR_CORE_SOURCES
    ${CMAKE_CURRENT_SOURCE_DIR}/hiptensor.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/logger.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/performance.cpp
-   ${CMAKE_CURRENT_SOURCE_DIR}/types.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/data_types.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/hip_device.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/handle.cpp
 )
diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp
index e018661b..4fa7acf7 100644
--- a/library/src/contraction/contraction_meta_traits.hpp
+++ b/library/src/contraction/contraction_meta_traits.hpp
@@ -34,8 +34,8 @@
 #include <element_wise_operation.hpp>
 
 // hiptensor includes
+#include "data_types.hpp"
 #include "meta_traits.hpp"
-#include "types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/contraction/contraction_solution_params.hpp b/library/src/contraction/contraction_solution_params.hpp
index f55d3c85..ec9de45c 100644
--- a/library/src/contraction/contraction_solution_params.hpp
+++ b/library/src/contraction/contraction_solution_params.hpp
@@ -30,7 +30,7 @@
 #include <hiptensor/hiptensor_types.hpp>
 
 #include "contraction_types.hpp"
-#include "types.hpp"
+#include "data_types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp
index 194fb0d5..bff33960 100644
--- a/library/src/contraction/contraction_solution_params_impl.hpp
+++ b/library/src/contraction/contraction_solution_params_impl.hpp
@@ -29,8 +29,8 @@
 
 #include "contraction_meta_traits.hpp"
 #include "contraction_solution_params.hpp"
+#include "data_types.hpp"
 #include "hash.hpp"
-#include "types.hpp"
 
 namespace std
 {
diff --git a/library/src/contraction/contraction_solution_registry.hpp b/library/src/contraction/contraction_solution_registry.hpp
index 039e9e14..d1b80ec5 100644
--- a/library/src/contraction/contraction_solution_registry.hpp
+++ b/library/src/contraction/contraction_solution_registry.hpp
@@ -32,8 +32,8 @@
 #include <vector>
 
 #include "contraction_types.hpp"
+#include "data_types.hpp"
 #include "singleton.hpp"
-#include "types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/types.cpp b/library/src/data_types.cpp
similarity index 99%
rename from library/src/types.cpp
rename to library/src/data_types.cpp
index 9cfb3290..b270973d 100644
--- a/library/src/types.cpp
+++ b/library/src/data_types.cpp
@@ -24,7 +24,7 @@
  *
  *******************************************************************************/
 
-#include "types.hpp"
+#include "data_types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp
index 3016ef64..9740d2a8 100644
--- a/library/src/hiptensor.cpp
+++ b/library/src/hiptensor.cpp
@@ -27,9 +27,9 @@
 
 #include <hiptensor/hiptensor.hpp>
 
+#include "data_types.hpp"
 #include "handle.hpp"
 #include "logger.hpp"
-#include "types.hpp"
 #include "util.hpp"
 
 hiptensorStatus_t hiptensorCreate(hiptensorHandle_t** handle)
diff --git a/library/src/include/config.hpp b/library/src/include/config.hpp
new file mode 100644
index 00000000..91e0bf53
--- /dev/null
+++ b/library/src/include/config.hpp
@@ -0,0 +1,184 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_CONFIG_HPP
+#define HIPTENSOR_CONFIG_HPP
+
+namespace hiptensor
+{
+
+///
+/// Architecture support
+/// Guaranteed symbols:
+/// HIPTENSOR_ARCH_GFX908
+/// HIPTENSOR_ARCH_GFX90a
+/// HIPTENSOR_ARCH_GFX940
+/// HIPTENSOR_ARCH_GFX941
+/// HIPTENSOR_ARCH_GFX942
+/// HIPTENSOR_ARCH_GFX1100
+/// HIPTENSOR_ARCH_GFX1101
+/// HIPTENSOR_ARCH_GFX1102
+#if defined(__gfx908__)
+#define HIPTENSOR_ARCH_GFX908 __gfx908__
+#elif defined(__gfx90a__)
+#define HIPTENSOR_ARCH_GFX90A __gfx90a__
+#elif defined(__gfx940__)
+#define HIPTENSOR_ARCH_GFX940 __gfx940__
+#elif defined(__gfx941__)
+#define HIPTENSOR_ARCH_GFX941 __gfx941__
+#elif defined(__gfx942__)
+#define HIPTENSOR_ARCH_GFX942 __gfx942__
+#elif defined(__gfx1100__)
+#define HIPTENSOR_ARCH_GFX1100 __gfx1100__
+#elif defined(__gfx1101__)
+#define HIPTENSOR_ARCH_GFX1101 __gfx1101__
+#elif defined(__gfx1102__)
+#define HIPTENSOR_ARCH_GFX1102 __gfx1102__
+#else
+#define HIPTENSOR_ARCH_HOST 1
+#endif
+
+#if !defined(HIPTENSOR_ARCH_GFX908)
+#define HIPTENSOR_ARCH_GFX908 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX90A)
+#define HIPTENSOR_ARCH_GFX90A 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX940)
+#define HIPTENSOR_ARCH_GFX940 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX941)
+#define HIPTENSOR_ARCH_GFX941 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX942)
+#define HIPTENSOR_ARCH_GFX942 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX1100)
+#define HIPTENSOR_ARCH_GFX1100 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX1101)
+#define HIPTENSOR_ARCH_GFX1101 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX1102)
+#define HIPTENSOR_ARCH_GFX1102 0
+#endif
+#if !defined(HIPTENSOR_ARCH_HOST)
+#define HIPTENSOR_ARCH_HOST 0
+#endif
+
+///
+/// Architecture configuration
+/// Guaranteed symbols:
+/// HIPTENSOR_ARCH_GFX9
+/// HIPTENSOR_ARCH_GFX11
+/// HIPTENSOR_WAVE64_MODE
+/// HIPTENSOR_WAVE32_MODE
+/// HIPTENSOR_BLOCK_DIM_16_SUPPORTED
+/// HIPTENSOR_BLOCK_DIM_32_SUPPORTED
+///
+#if HIPTENSOR_ARCH_GFX908 || HIPTENSOR_ARCH_GFX90A || HIPTENSOR_ARCH_GFX940 \
+    || HIPTENSOR_ARCH_GFX941 || HIPTENSOR_ARCH_GFX942
+#define HIPTENSOR_ARCH_GFX9 1
+#define HIPTENSOR_WAVE64_MODE 1
+#define HIPTENSOR_BLOCK_DIM_16_SUPPORTED 1
+#define HIPTENSOR_BLOCK_DIM_32_SUPPORTED 1
+#endif
+
+#if HIPTENSOR_ARCH_GFX1100 || HIPTENSOR_ARCH_GFX1101 || HIPTENSOR_ARCH_GFX1102
+#define HIPTENSOR_ARCH_GFX11 1
+#define HIPTENSOR_WAVE32_MODE 1
+#define HIPTENSOR_BLOCK_DIM_16_SUPPORTED 1
+#endif
+
+#if !defined(HIPTENSOR_ARCH_GFX9)
+#define HIPTENSOR_ARCH_GFX9 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX11)
+#define HIPTENSOR_ARCH_GFX11 0
+#endif
+#if !defined(HIPTENSOR_WAVE64_MODE)
+#define HIPTENSOR_WAVE64_MODE 0
+#endif
+#if !defined(HIPTENSOR_WAVE32_MODE)
+#define HIPTENSOR_WAVE32_MODE 0
+#endif
+#if !defined(HIPTENSOR_BLOCK_DIM_16_SUPPORTED)
+#define HIPTENSOR_BLOCK_DIM_16_SUPPORTED 0
+#endif
+#if !defined(HIPTENSOR_BLOCK_DIM_32_SUPPORTED)
+#define HIPTENSOR_BLOCK_DIM_32_SUPPORTED 0
+#endif
+
+#if defined(NDEBUG)
+#define HIPTENSOR_UNSUPPORTED_IMPL(MSG)
+#else
+#define HIPTENSOR_UNSUPPORTED_IMPL(MSG) __attribute__((deprecated(MSG)))
+#endif
+
+#if defined(HIP_NO_HALF)
+#define HIPTENSOR_NO_HALF 1
+#else
+#define HIPTENSOR_NO_HALF 0
+#endif // HIP_NO_HALF
+
+#if HIPTENSOR_NO_HALF || (!HIPTENSOR_NO_HALF && defined(__HIP_NO_HALF_CONVERSIONS__))
+#define HIPTENSOR_TESTS_NO_HALF 1
+#else
+#define HIPTENSOR_TESTS_NO_HALF 0
+#endif // !HIPTENSOR_NO_HALF && defined(__HIP_NO_HALF_CONVERSIONS__)
+
+///
+/// Sanity checks
+///
+#if HIPTENSOR_ARCH_GFX11
+    static_assert((bool)(HIPTENSOR_WAVE32_MODE) && !(bool)(HIPTENSOR_WAVE64_MODE),
+                  "hipTensor supports only wave32 for gfx11 arch");
+    static_assert((bool)(HIPTENSOR_BLOCK_DIM_16_SUPPORTED)
+                      && !(bool)(HIPTENSOR_BLOCK_DIM_32_SUPPORTED),
+                  "hipTensor supports only block size of 16 for gfx11 arch");
+#endif
+
+#if HIPTENSOR_ARCH_GFX9
+    static_assert(!(bool)(HIPTENSOR_WAVE32_MODE) && (bool)(HIPTENSOR_WAVE64_MODE),
+                  "hipTensor supports only wave64 for gfx9 arch");
+    static_assert((bool)(HIPTENSOR_BLOCK_DIM_16_SUPPORTED)
+                      && (bool)(HIPTENSOR_BLOCK_DIM_32_SUPPORTED),
+                  "hipTensor requires block size of 16 and 32 for gfx9 arch");
+#endif
+
+///
+/// Host and Device symbols
+///
+#define HIPTENSOR_DEVICE __device__
+
+#define HIPTENSOR_HOST __host__
+
+#define HIPTENSOR_HOST_DEVICE HIPTENSOR_HOST HIPTENSOR_DEVICE
+
+#define HIPTENSOR_KERNEL __global__
+
+} // namespace hiptensor
+
+#endif // HIPTENSOR_CONFIG_HPP
diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp
new file mode 100644
index 00000000..42197650
--- /dev/null
+++ b/library/src/include/data_types.hpp
@@ -0,0 +1,78 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_LIBRARY_DATA_TYPES_HPP
+#define HIPTENSOR_LIBRARY_DATA_TYPES_HPP
+
+// clang-format off
+// Include order needs to be preserved
+#include <hip/library_types.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_fp16.h>
+#include <iostream>
+
+#include <hiptensor/hiptensor_types.hpp>
+
+// clang-format on
+
+namespace hiptensor
+{
+    // Used to map to empty tensors
+    struct NoneType;
+
+    static constexpr hipDataType NONE_TYPE = (hipDataType)31;
+
+    // Map type to runtime HipDataType
+    template <typename T>
+    struct HipDataType;
+
+    template <typename T>
+    static constexpr auto HipDataType_v = HipDataType<T>::value;
+
+    // Get data size in bytes from id
+    uint32_t hipDataTypeSize(hipDataType id);
+
+    // Convert hipDataType to hiptensorComputeType_t
+    hiptensorComputeType_t convertToComputeType(hipDataType hipType);
+
+    // Read a single value from void pointer, casted to T
+    template <typename T>
+    T readVal(void const* value, hipDataType id);
+
+    template <typename T>
+    T readVal(void const* value, hiptensorComputeType_t id);
+
+} // namespace hiptensor
+
+bool operator==(hipDataType hipType, hiptensorComputeType_t computeType);
+bool operator==(hiptensorComputeType_t computeType, hipDataType hipType);
+
+bool operator!=(hipDataType hipType, hiptensorComputeType_t computeType);
+bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType);
+
+#include "data_types_impl.hpp"
+
+#endif // HIPTENSOR_LIBRARY_DATA_TYPES_HPP
diff --git a/library/src/include/types_impl.hpp b/library/src/include/data_types_impl.hpp
similarity index 97%
rename from library/src/include/types_impl.hpp
rename to library/src/include/data_types_impl.hpp
index 7a07d0b0..7df6d7d9 100644
--- a/library/src/include/types_impl.hpp
+++ b/library/src/include/data_types_impl.hpp
@@ -24,10 +24,10 @@
  *
  *******************************************************************************/
 
-#ifndef HIPTENSOR_LIBRARY_TYPES_IMPL_HPP
-#define HIPTENSOR_LIBRARY_TYPES_IMPL_HPP
+#ifndef HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP
+#define HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP
 
-#include "types.hpp"
+#include "data_types.hpp"
 
 namespace hiptensor
 {
@@ -217,4 +217,4 @@ namespace hiptensor
 
 } // namespace hiptensor
 
-#endif // HIPTENSOR_LIBRARY_TYPES_IMPL_HPP
+#endif // HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP
diff --git a/library/src/include/native_types.hpp b/library/src/include/native_types.hpp
new file mode 100644
index 00000000..c8df80e4
--- /dev/null
+++ b/library/src/include/native_types.hpp
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_NATIVE_TYPES_HPP
+#define HIPTENSOR_NATIVE_TYPES_HPP
+
+#if !defined(__HIPCC_RTC__)
+#include <array>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_vector_types.h>
+#include <type_traits>
+#include <utility>
+#endif // !__HIPCC_RTC__
+
+#include "xfloat32.hpp"
+
+namespace hiptensor
+{
+
+    /**
+ * \defgroup DataTypes Data Type Metadata
+ *
+ * @brief Definition and metadata on supported data types of matrices.
+ *
+ * @{
+ *
+ * Native Data Types:
+ * float64_t = f64 = double
+ * float = f32
+ * _Float16 = f16
+ * int8
+ * uint8
+ * int16
+ * int32
+ * uint32
+ *
+ *
+ * Non-Native Data Types:
+ * h16 = __half
+ * bf16 = bfloat16
+ *
+ */
+
+    // Native types
+    using float16_t = _Float16;
+    using float32_t = float;
+    using float64_t = double;
+
+#if !defined(__HIPCC_RTC__)
+
+    using int8_t   = ::int8_t;
+    using uint8_t  = ::uint8_t;
+    using int16_t  = ::int16_t;
+    using uint16_t = ::uint16_t;
+    using int32_t  = ::int32_t;
+    using uint32_t = ::uint32_t;
+    using int64_t  = ::int64_t;
+    using uint64_t = ::uint64_t;
+    using index_t  = ::int32_t;
+
+#else
+
+    using int8_t   = __hip_internal::int8_t;
+    using uint8_t  = __hip_internal::uint8_t;
+    using int16_t  = __hip_internal::int16_t;
+    using uint16_t = __hip_internal::uint16_t;
+    using int32_t  = __hip_internal::int32_t;
+    using uint32_t = __hip_internal::uint32_t;
+    using int64_t  = __hip_internal::int64_t;
+    using uint64_t = __hip_internal::uint64_t;
+    using index_t  = __hip_internal::int32_t;
+
+#endif // !defined(__HIPCC_RTC__)
+
+    // Non-native types
+    using bfloat16_t = hip_bfloat16;
+
+#if !HIPTENSOR_NO_HALF
+    using hfloat16_t = __half;
+#endif // !HIPTENSOR_NO_HALF
+
+    using xfloat32_t = hiptensor_xfloat32;
+
+    // clang-format off
+
+
+} // namespace hiptensor
+
+// Add in some extensions to basic type support.
+// Some of these are required for vector implementations.
+// #include "type_traits.hpp"
+// #include "types_ext.hpp"
+
+#include "native_types_impl.hpp"
+
+#endif // HIPTENSOR_NATIVE_TYPES_HPP
diff --git a/library/src/include/native_types_impl.hpp b/library/src/include/native_types_impl.hpp
new file mode 100644
index 00000000..1b29b459
--- /dev/null
+++ b/library/src/include/native_types_impl.hpp
@@ -0,0 +1,36 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_NATIVE_TYPES_IMPL_HPP
+#define HIPTENSOR_NATIVE_TYPES_IMPL_HPP
+
+#include "native_types.hpp"
+
+namespace hiptensor
+{
+
+} // namespace hiptensor
+
+#endif // HIPTENSOR_NATIVE_TYPES_IMPL_HPP
diff --git a/library/src/include/type_traits.hpp b/library/src/include/type_traits.hpp
new file mode 100644
index 00000000..20d73a03
--- /dev/null
+++ b/library/src/include/type_traits.hpp
@@ -0,0 +1,749 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_TYPE_TRAITS_HPP
+#define HIPTENSOR_TYPE_TRAITS_HPP
+
+#if !defined(__HIPCC_RTC__)
+
+#include <cfloat>
+
+#else
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define FLT_MAX __FLT_MAX__
+#define FLT_MIN __FLT_MIN__
+#define HUGE_VALF (__builtin_huge_valf())
+
+#endif // !defined(__HIPCC_RTC__)
+
+#include "native_types.hpp"
+
+namespace hiptensor
+{
+    namespace detail
+    {
+        struct Fp16Bits
+        {
+            union
+            {
+                uint16_t  i16;
+                float16_t f16;
+#if !HIPTENSOR_NO_HALF
+                hfloat16_t h16;
+#endif // !HIPTENSOR_NO_HALF
+                bfloat16_t b16;
+            };
+            constexpr Fp16Bits(uint16_t initVal)
+                : i16(initVal)
+            {
+            }
+#define TEST_TEST 1
+            constexpr Fp16Bits(float16_t initVal)
+                : f16(initVal)
+            {
+            }
+#if !HIPTENSOR_NO_HALF
+            constexpr Fp16Bits(hfloat16_t initVal)
+                : h16(initVal)
+            {
+            }
+#endif
+            constexpr Fp16Bits(bfloat16_t initVal)
+                : b16(initVal)
+            {
+            }
+        };
+
+        struct Fp32Bits
+        {
+            union
+            {
+                uint32_t   i32;
+                float32_t  f32;
+                xfloat32_t xf32;
+            };
+            constexpr Fp32Bits(uint32_t initVal)
+                : i32(initVal)
+            {
+            }
+            constexpr Fp32Bits(float32_t initVal)
+                : f32(initVal)
+            {
+            }
+            constexpr Fp32Bits(xfloat32_t initVal)
+                : xf32(initVal)
+            {
+            }
+        };
+
+    } // namespace detail
+} // namespace hiptensor
+
+///////////////////////////////////////////////////////////
+/////////////  std replacements for hipRTC  ///////////////
+///////////////////////////////////////////////////////////
+#if defined(__HIPCC_RTC__)
+namespace std
+{
+    template <typename T>
+    class numeric_limits
+    {
+    public:
+        HIPTENSOR_HOST_DEVICE static constexpr T min() noexcept;
+        HIPTENSOR_HOST_DEVICE static constexpr T lowest() noexcept;
+        HIPTENSOR_HOST_DEVICE static constexpr T max() noexcept;
+        HIPTENSOR_HOST_DEVICE static constexpr T epsilon() noexcept;
+        HIPTENSOR_HOST_DEVICE static constexpr T round_error() noexcept;
+        HIPTENSOR_HOST_DEVICE static constexpr T infinity() noexcept;
+        HIPTENSOR_HOST_DEVICE static constexpr T quiet_NaN() noexcept;
+        HIPTENSOR_HOST_DEVICE static constexpr T signaling_NaN() noexcept;
+        HIPTENSOR_HOST_DEVICE static constexpr T denorm_min() noexcept;
+    };
+
+    template <bool B, class T = void>
+    using enable_if_t = typename enable_if<B, T>::type;
+
+    template <bool B, class T, class F>
+    struct conditional
+    {
+    };
+
+    template <class T, class F>
+    struct conditional<true, T, F>
+    {
+        using type = T;
+    };
+
+    template <class T, class F>
+    struct conditional<false, T, F>
+    {
+        using type = F;
+    };
+
+    template <bool B, class T, class F>
+    using conditional_t = typename conditional<B, T, F>::type;
+
+    template <typename T>
+    HIPTENSOR_HOST_DEVICE const T& max(const T& a, const T& b)
+    {
+        return (a < b) ? b : a;
+    }
+
+    template <typename T>
+    HIPTENSOR_HOST_DEVICE const T& min(const T& a, const T& b)
+    {
+        return (b < a) ? a : b;
+    }
+
+    // Meta programming helper types.
+
+    template <bool, typename, typename>
+    struct conditional;
+
+    template <typename...>
+    struct __or_;
+
+    template <>
+    struct __or_<> : public false_type
+    {
+    };
+
+    template <typename _B1>
+    struct __or_<_B1> : public _B1
+    {
+    };
+
+    template <typename _B1, typename _B2>
+    struct __or_<_B1, _B2> : public conditional<_B1::value, _B1, _B2>::type
+    {
+    };
+
+    template <typename _B1, typename _B2, typename _B3, typename... _Bn>
+    struct __or_<_B1, _B2, _B3, _Bn...>
+        : public conditional<_B1::value, _B1, __or_<_B2, _B3, _Bn...>>::type
+    {
+    };
+
+    template <typename...>
+    struct __and_;
+
+    template <>
+    struct __and_<> : public true_type
+    {
+    };
+
+    template <typename _B1>
+    struct __and_<_B1> : public _B1
+    {
+    };
+
+    template <typename _B1, typename _B2>
+    struct __and_<_B1, _B2> : public conditional<_B1::value, _B2, _B1>::type
+    {
+    };
+
+    template <typename _B1, typename _B2, typename _B3, typename... _Bn>
+    struct __and_<_B1, _B2, _B3, _Bn...>
+        : public conditional<_B1::value, __and_<_B2, _B3, _Bn...>, _B1>::type
+    {
+    };
+
+    template <bool __v>
+    using __bool_constant = integral_constant<bool, __v>;
+
+    template <typename _Pp>
+    struct __not_ : public __bool_constant<!bool(_Pp::value)>
+    {
+    };
+
+    // remove_reference
+    template <typename T>
+    struct remove_reference
+    {
+        typedef T type;
+    };
+
+    template <typename T>
+    struct remove_reference<T&>
+    {
+        typedef T type;
+    };
+
+    template <typename T>
+    struct remove_reference<T&&>
+    {
+        typedef T type;
+    };
+
+    // is_lvalue_reference
+    template <typename>
+    struct is_lvalue_reference : public false_type
+    {
+    };
+
+    template <typename T>
+    struct is_lvalue_reference<T&> : public true_type
+    {
+    };
+
+    // is_rvalue_reference
+    template <typename>
+    struct is_rvalue_reference : public false_type
+    {
+    };
+
+    template <typename T>
+    struct is_rvalue_reference<T&&> : public true_type
+    {
+    };
+
+    // lvalue forwarding
+    template <typename T>
+    constexpr T&& forward(typename remove_reference<T>::type& __t) noexcept
+    {
+        return static_cast<T&&>(__t);
+    }
+
+    // rvalue forwarding
+    template <typename T>
+    constexpr T&& forward(typename remove_reference<T>::type&& __t) noexcept
+    {
+        static_assert(!is_lvalue_reference<T>::value,
+                      "template argument"
+                      " substituting T is an lvalue reference type");
+        return static_cast<T&&>(__t);
+    }
+
+    // remove_const
+    template <typename T>
+    struct remove_const
+    {
+        typedef T type;
+    };
+
+    template <typename T>
+    struct remove_const<T const>
+    {
+        typedef T type;
+    };
+
+    // remove_volatile
+    template <typename T>
+    struct remove_volatile
+    {
+        typedef T type;
+    };
+
+    template <typename T>
+    struct remove_volatile<T volatile>
+    {
+        typedef T type;
+    };
+
+    // remove_cv
+    template <typename T>
+    struct remove_cv
+    {
+        typedef typename remove_const<typename remove_volatile<T>::type>::type type;
+    };
+
+    // remove_extent
+    template <typename T>
+    struct remove_extent
+    {
+        typedef T type;
+    };
+
+    template <typename T, std::size_t _Size>
+    struct remove_extent<T[_Size]>
+    {
+        typedef T type;
+    };
+
+    template <typename T>
+    struct remove_extent<T[]>
+    {
+        typedef T type;
+    };
+
+    // is_void
+    template <typename>
+    struct __is_void_helper : public false_type
+    {
+    };
+
+    template <>
+    struct __is_void_helper<void> : public true_type
+    {
+    };
+
+    template <typename T>
+    struct is_void : public __is_void_helper<typename remove_cv<T>::type>::type
+    {
+    };
+
+    // is_reference
+    template <typename T>
+    struct is_reference : public __or_<is_lvalue_reference<T>, is_rvalue_reference<T>>::type
+    {
+    };
+
+    // is_function
+    template <typename>
+    struct is_function : public false_type
+    {
+    };
+
+    // is_object
+    template <typename T>
+    struct is_object : public __not_<__or_<is_function<T>, is_reference<T>, is_void<T>>>::type
+    {
+    };
+
+    // __is_referenceable
+    template <typename T>
+    struct __is_referenceable : public __or_<is_object<T>, is_reference<T>>::type{};
+
+    // add_pointer
+    template <typename T, bool = __or_<__is_referenceable<T>, is_void<T>>::value>
+    struct __add_pointer_helper
+    {
+        typedef T type;
+    };
+
+    template <typename T>
+    struct __add_pointer_helper<T, true>
+    {
+        typedef typename remove_reference<T>::type* type;
+    };
+
+    template <typename T>
+    struct add_pointer : public __add_pointer_helper<T>
+    {
+    };
+
+    // is_array
+    template <typename>
+    struct is_array : public false_type
+    {
+    };
+
+    template <typename T, std::size_t _Size>
+    struct is_array<T[_Size]> : public true_type
+    {
+    };
+
+    template <typename T>
+    struct is_array<T[]> : public true_type
+    {
+    };
+
+    // decay selectors
+    template <typename _Up,
+              bool _IsArray    = is_array<_Up>::value,
+              bool _IsFunction = is_function<_Up>::value>
+    struct __decay_selector;
+
+    template <typename _Up>
+    struct __decay_selector<_Up, false, false>
+    {
+        typedef typename remove_cv<_Up>::type __type;
+    };
+
+    template <typename _Up>
+    struct __decay_selector<_Up, true, false>
+    {
+        typedef typename remove_extent<_Up>::type* __type;
+    };
+
+    template <typename _Up>
+    struct __decay_selector<_Up, false, true>
+    {
+        typedef typename add_pointer<_Up>::type __type;
+    };
+
+    // decay
+    template <typename T>
+    class decay
+    {
+        typedef typename remove_reference<T>::type __remove_type;
+
+    public:
+        typedef typename __decay_selector<__remove_type>::__type type;
+    };
+
+    template <typename T>
+    using decay_t = typename decay<T>::type;
+
+} // namespace std
+#endif
+
+namespace std
+{
+#if defined(__HIPCC_RTC__)
+    using uint16_t = hiptensor::uint16_t;
+#endif
+
+    ///////////////////////////////////////////////////////////
+    ///////////  std::numeric_limits<float16_t>  //////////////
+    ///////////////////////////////////////////////////////////
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::epsilon() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x1400));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::infinity() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7C00));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::lowest() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0xFBFF));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::max() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7BFF));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::min() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x0400));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::quiet_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7FFF));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::signaling_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7DFF));
+        return eps.f16;
+    }
+
+    ///////////////////////////////////////////////////////////
+    ///////////  std::numeric_limits<hfloat16_t>  /////////////
+    ///////////////////////////////////////////////////////////
+#if !HIPTENSOR_NO_HALF
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::epsilon() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x1400));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::infinity() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7C00));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::lowest() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0xFBFF));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::max() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7BFF));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::min() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x0400));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::quiet_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7FFF));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::signaling_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7DFF));
+        return eps.h16;
+    }
+
+#endif // !HIPTENSOR_NO_HALF
+
+    ///////////////////////////////////////////////////////////
+    ///////////  std::numeric_limits<bfloat16_t>  /////////////
+    ///////////////////////////////////////////////////////////
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::epsilon() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x3C00));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::infinity() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7F80));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::lowest() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0xFF7F));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::max() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7F7F));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::min() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x007F));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::quiet_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7FC0));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::signaling_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7FC0));
+        return eps.b16;
+    }
+
+    ///////////////////////////////////////////////////////////
+    ///////////  std::numeric_limits<xfloat32_t>  //////////////
+    ///////////////////////////////////////////////////////////
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::epsilon() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(FLT_EPSILON));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::infinity() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(HUGE_VALF));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::lowest() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(-FLT_MAX));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::max() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(FLT_MAX));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::min() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(FLT_MIN));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::quiet_NaN() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<uint32_t>(0x7FF80000));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::signaling_NaN() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<uint32_t>(0x7FF00000));
+        return eps.xf32;
+    }
+    // @endcond
+
+} // namespace std
+
+namespace hiptensor
+{
+#if !defined(__HIPCC_RTC__)
+    template <typename T, typename std::enable_if_t<std::is_integral<T>::value, int> = 0>
+    constexpr auto maxExactInteger() -> decltype(std::numeric_limits<T>::max())
+    {
+        return std::numeric_limits<T>::max();
+    }
+
+    template <typename T,
+              typename std::enable_if_t<std::is_floating_point<T>::value
+                                            && std::numeric_limits<T>::digits,
+                                        int>
+              = 0>
+    constexpr auto maxExactInteger() ->
+        typename std::conditional_t<std::is_same<T, float64_t>::value, int64_t, int32_t>
+    {
+        using RetT =
+            typename std::conditional_t<std::is_same<T, float64_t>::value, int64_t, int32_t>;
+        return ((RetT)1 << std::numeric_limits<T>::digits);
+    }
+
+    template <typename T,
+              typename std::enable_if_t<
+#if !HIPTENSOR_NO_HALF
+                  std::is_same<T, hfloat16_t>::value ||
+#endif // !HIPTENSOR_NO_HALF
+                      std::is_same<T, float16_t>::value,
+                  int>
+              = 0>
+    constexpr auto maxExactInteger() -> int32_t
+    {
+        // f16 mantissa is 10 bits
+        return ((int32_t)1 << 11);
+    }
+
+    template <typename T, typename std::enable_if_t<std::is_same<T, bfloat16_t>::value, int> = 0>
+    constexpr auto maxExactInteger() -> int32_t
+    {
+        // b16 mantissa is 7 bits
+        return ((int32_t)1 << 8);
+    }
+
+    template <typename T, typename std::enable_if_t<std::is_same<T, xfloat32_t>::value, int> = 0>
+    constexpr auto maxExactInteger() -> int32_t
+    {
+        // xf32 mantissa is 7 bits
+        return ((int32_t)1 << 8);
+    }
+#endif // !defined(__HIPCC_RTC__)
+
+} // namespace hiptensor
+
+#endif // HIPTENSOR_TYPE_TRAITS_HPP
diff --git a/library/src/include/types.hpp b/library/src/include/types.hpp
index 8cd677f3..2cc7805c 100644
--- a/library/src/include/types.hpp
+++ b/library/src/include/types.hpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -19,60 +19,10 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  *
  *******************************************************************************/
-
-#ifndef HIPTENSOR_LIBRARY_TYPES_HPP
-#define HIPTENSOR_LIBRARY_TYPES_HPP
-
-// clang-format off
-// Include order needs to be preserved
-#include <hip/library_types.h>
-#include <hip/hip_bfloat16.h>
-#include <hip/hip_fp16.h>
-#include <iostream>
-
-#include <hiptensor/hiptensor_types.hpp>
-
-// clang-format on
-
-namespace hiptensor
-{
-    // Used to map to empty tensors
-    struct NoneType;
-
-    static constexpr hipDataType NONE_TYPE = (hipDataType)31;
-
-    // Map type to runtime HipDataType
-    template <typename T>
-    struct HipDataType;
-
-    template <typename T>
-    static constexpr auto HipDataType_v = HipDataType<T>::value;
-
-    // Get data size in bytes from id
-    uint32_t hipDataTypeSize(hipDataType id);
-
-    // Convert hipDataType to hiptensorComputeType_t
-    hiptensorComputeType_t convertToComputeType(hipDataType hipType);
-
-    // Read a single value from void pointer, casted to T
-    template <typename T>
-    T readVal(void const* value, hipDataType id);
-
-    template <typename T>
-    T readVal(void const* value, hiptensorComputeType_t id);
-
-} // namespace hiptensor
-
-bool operator==(hipDataType hipType, hiptensorComputeType_t computeType);
-bool operator==(hiptensorComputeType_t computeType, hipDataType hipType);
-
-bool operator!=(hipDataType hipType, hiptensorComputeType_t computeType);
-bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType);
-
-#include "types_impl.hpp"
-
-#endif // HIPTENSOR_LIBRARY_TYPES_HPP
+#include "native_types.hpp"
+#include "type_traits.hpp"
+#include "types_ext.hpp"
diff --git a/library/src/include/types_ext.hpp b/library/src/include/types_ext.hpp
new file mode 100644
index 00000000..e668c17d
--- /dev/null
+++ b/library/src/include/types_ext.hpp
@@ -0,0 +1,195 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_TYPES_EXT_HPP
+#define HIPTENSOR_TYPES_EXT_HPP
+
+#if !defined(__HIPCC_RTC__)
+#include <cmath>
+#include <hip/hip_bfloat16.h>
+#include <limits>
+#include <ostream>
+#else
+#include "utils.hpp"
+#endif // !defined(__HIPCC_RTC__)
+
+#include "type_traits.hpp"
+
+namespace hiptensor
+{
+
+#if !defined(__HIPCC_RTC__)
+
+    ////////////////////////////////////////////////////////////////////////
+    ///////////  hiptensor::hfloat16_t host and device conversions  //////////
+    ////////////////////////////////////////////////////////////////////////
+    template <typename Outgoing,
+              typename Incoming,
+              typename std::enable_if_t<!std::is_same_v<Incoming, Outgoing>, int> = 0>
+    __host__ __device__ inline Outgoing convert(const Incoming& value)
+    {
+#if !HIPTENSOR_NO_HALF
+        if constexpr(std::is_same_v<Outgoing, hfloat16_t>)
+        {
+
+#if defined(__HIP_NO_HALF_CONVERSIONS__)
+            detail::Fp16Bits fp16(static_cast<float16_t>(value));
+            return fp16.h16;
+#else
+            return static_cast<hfloat16_t>(value);
+#endif // defined(__HIP_NO_HALF_CONVERSIONS__)
+        }
+        else if constexpr(std::is_same_v<Incoming, hfloat16_t>)
+        {
+
+#if defined(__HIP_NO_HALF_CONVERSIONS__)
+            detail::Fp16Bits fp16(value);
+            return static_cast<Outgoing>(fp16.f16);
+#else
+            return static_cast<Outgoing>(value);
+#endif // defined(__HIP_NO_HALF_CONVERSIONS__)
+        }
+        else
+#endif // !HIPTENSOR_NO_HALF
+        {
+            return static_cast<Outgoing>(value);
+        }
+    }
+
+    template <typename Outgoing,
+              typename Incoming,
+              typename std::enable_if_t<std::is_same_v<Incoming, Outgoing>, int> = 0>
+    __host__ __device__ inline Outgoing const& convert(const Incoming& value)
+    {
+        return value;
+    }
+
+    ////////////////////////////////////////////////////////////////////
+    ///////////  hiptensor::hfloat16_t host & device operators  //////////
+    ///////////////////////////////////////////////////////////////////
+
+#if defined(__HIP_NO_HALF_OPERATORS__)
+// No operators defined for host or device
+#define HIPTENSOR_HALF_OP_ATTR HIPTENSOR_HOST_DEVICE
+#else
+// No operators defined just for host
+#define HIPTENSOR_HALF_OP_ATTR HIPTENSOR_HOST
+#endif // defined(__HIP_NO_HALF_OPERATORS__)
+
+#if !HIPTENSOR_NO_HALF
+
+    HIPTENSOR_HALF_OP_ATTR inline bool operator==(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        auto absDiff = std::fabs(__half2float(x) - __half2float(y));
+        auto absAdd  = std::fabs(__half2float(x) + __half2float(y));
+        return absDiff <= __half2float(std::numeric_limits<hfloat16_t>::epsilon()) * absAdd * 2.0f
+               || absDiff < __half2float(std::numeric_limits<hfloat16_t>::min());
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline bool operator!=(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return !(x == y);
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator-(const hfloat16_t& x)
+    {
+        detail::Fp16Bits fp16(x);
+        fp16.i16 ^= 0x8000; // Flip sign
+        return fp16.h16;
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator+(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return convert<hfloat16_t>(convert<float16_t>(x) + convert<float16_t>(y));
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator-(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return convert<hfloat16_t>(convert<float16_t>(x) - convert<float16_t>(y));
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator*(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return convert<hfloat16_t>(convert<float16_t>(x) * convert<float16_t>(y));
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator/(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return convert<hfloat16_t>(convert<float16_t>(x) / convert<float16_t>(y));
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t& operator+=(hfloat16_t& x, const hfloat16_t& y)
+    {
+        return x = x + y;
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t& operator-=(hfloat16_t& x, const hfloat16_t& y)
+    {
+        return x = x - y;
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t& operator*=(hfloat16_t& x, const hfloat16_t& y)
+    {
+        return x = x * y;
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t& operator/=(hfloat16_t& x, const hfloat16_t& y)
+    {
+        return x = x / y;
+    }
+
+#endif // !HIPTENSOR_NO_HALF
+
+#endif // !defined(__HIPCC_RTC__)
+
+} // namespace hiptensor
+
+namespace std
+{
+#if !defined(__HIPCC_RTC__)
+    ///////////////////////////////////////////////////////////
+    //////////  std::ostream::operator<<(float16_t)  //////////
+    ///////////////////////////////////////////////////////////
+
+    inline ostream& operator<<(ostream& stream, hiptensor::float16_t const& val)
+    {
+        return stream << static_cast<float>(val);
+    }
+
+    ///////////////////////////////////////////////////////////
+    //////////  std::ostream::operator<<(hfloat16_t)  /////////
+    ///////////////////////////////////////////////////////////
+#if !HIPTENSOR_NO_HALF
+    inline ostream& operator<<(ostream& stream, hiptensor::hfloat16_t const& val)
+    {
+        return stream << __half2float(val);
+    }
+#endif // !HIPTENSOR_NO_HALF
+
+#endif // !defined(__HIPCC_RTC__) && !HIPTENSOR_NO_HALF
+
+} // namespace std
+
+#endif // HIPTENSOR_TYPES_EXT_HPP
diff --git a/library/src/include/xfloat32.hpp b/library/src/include/xfloat32.hpp
new file mode 100644
index 00000000..880d1bee
--- /dev/null
+++ b/library/src/include/xfloat32.hpp
@@ -0,0 +1,349 @@
+/* ************************************************************************
+ * Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+/*!\file
+ * \brief xfloat32.h provides struct for hiptensor_xfloat32 typedef
+ */
+
+#ifndef HIPTENSOR_XFLOAT32_HPP
+#define HIPTENSOR_XFLOAT32_HPP
+
+#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
+// include a minimal definition of hiptensor_xfloat32
+
+#include <stdint.h>
+typedef struct
+{
+    float data;
+} hiptensor_xfloat32;
+
+#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#if !defined(__HIPCC_RTC__)
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <hip/hip_runtime.h>
+#include <ostream>
+#include <type_traits>
+
+#else
+
+namespace std
+{
+    using __hip_internal::is_standard_layout;
+    using __hip_internal::is_trivial;
+}
+
+#endif // !defined(__HIPCC_RTC__)
+
+#include "config.hpp"
+
+struct hiptensor_xfloat32
+{
+    float data;
+
+    enum round_t
+    {
+        round_up
+    };
+
+    HIPTENSOR_HOST_DEVICE hiptensor_xfloat32() = default;
+
+    // round upper 19 bits of IEEE float to convert to xfloat32
+    explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f, round_t)
+        : data(float_to_xfloat32(f))
+    {
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f)
+        : data(truncate_float_to_xfloat32(f))
+    {
+    }
+
+    // zero extend lower 13 bits of xfloat32 to convert to IEEE float
+    HIPTENSOR_HOST_DEVICE operator float() const
+    {
+        return data;
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE operator bool() const
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {data};
+        return u.int32 & 0x7fffe000;
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE operator uint32_t() const
+    {
+        return uint32_t(float(*this));
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE operator long() const
+    {
+        return long(float(*this));
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE operator double() const
+    {
+        return double(float(*this));
+    }
+
+private:
+    static HIPTENSOR_HOST_DEVICE float float_to_xfloat32(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+        if(~u.int32 & 0x7f800000)
+        {
+            // When the exponent bits are not all 1s, then the value is zero, normal,
+            // or subnormal. We round the xfloat32 mantissa up by adding 0xFFF, plus
+            // 1 if the least significant bit of the xfloat32 mantissa is 1 (odd).
+            // This causes the xfloat32's mantissa to be incremented by 1 if the 13
+            // least significant bits of the float mantissa are greater than 0x1000,
+            // or if they are equal to 0x1000 and the least significant bit of the
+            // xfloat32 mantissa is 1 (odd). This causes it to be rounded to even when
+            // the lower 13 bits are exactly 0x1000. If the xfloat32 mantissa already
+            // has the value 0x3ff, then incrementing it causes it to become 0x00 and
+            // the exponent is incremented by one, which is the next higher FP value
+            // to the unrounded xfloat32 value. When the xfloat32 value is subnormal
+            // with an exponent of 0x00 and a mantissa of 0x3FF, it may be rounded up
+            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+            // When the xfloat32 value has an exponent of 0xFE and a mantissa of 0x3FF,
+            // incrementing it causes it to become an exponent of 0xFF and a mantissa
+            // of 0x00, which is Inf, the next higher value to the unrounded value.
+
+            u.int32 += 0xfff + ((u.int32 >> 13) & 1); // Round to nearest, round to even
+        }
+        else if(u.int32 & 0x1fff)
+        {
+            // When all of the exponent bits are 1, the value is Inf or NaN.
+            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+            // bit being 1. Signaling NaN is indicated by the most significant
+            // mantissa bit being 0 but some other bit(s) being 1. If any of the
+            // lower 13 bits of the mantissa are 1, we set the least significant bit
+            // of the xfloat32 mantissa, in order to preserve signaling NaN in case
+            // the xfloat32's mantissa bits are all 0.
+            u.int32 |= 0x2000; // Preserve signaling NaN
+        }
+
+        u.int32 &= 0xffffe000;
+        return u.fp32;
+    }
+
+    // Truncate instead of rounding
+    static HIPTENSOR_HOST_DEVICE float truncate_float_to_xfloat32(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+
+        u.int32 = u.int32 & 0xffffe000;
+        return u.fp32;
+    }
+};
+
+typedef struct
+{
+    float data;
+} hiptensor_xfloat32_public;
+
+static_assert(std::is_standard_layout<hiptensor_xfloat32>{},
+              "hiptensor_xfloat32 is not a standard layout type, and thus is "
+              "incompatible with C.");
+
+static_assert(std::is_trivial<hiptensor_xfloat32>{},
+              "hiptensor_xfloat32 is not a trivial type, and thus is "
+              "incompatible with C.");
+
+#if !defined(__HIPCC_RTC__)
+static_assert(sizeof(hiptensor_xfloat32) == sizeof(hiptensor_xfloat32_public)
+                  && offsetof(hiptensor_xfloat32, data)
+                         == offsetof(hiptensor_xfloat32_public, data),
+              "internal hiptensor_xfloat32 does not match public hiptensor_xfloat32");
+#endif // !defined(__HIPCC_RTC__)
+
+#if !defined(__HIPCC_RTC__)
+inline std::ostream& operator<<(std::ostream& os, const hiptensor_xfloat32& xf32)
+{
+    return os << float(xf32);
+}
+#endif // !defined(__HIPCC_RTC__)
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a)
+{
+    return a;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a)
+{
+    union
+    {
+        float    fp32;
+        uint32_t int32;
+    } u = {a.data};
+    u.int32 ^= 0x80000000;
+    return hiptensor_xfloat32(u.fp32);
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a,
+                                                          hiptensor_xfloat32 b)
+{
+    return hiptensor_xfloat32(float(a) + float(b));
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a,
+                                                          hiptensor_xfloat32 b)
+{
+    return hiptensor_xfloat32(float(a) - float(b));
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator*(hiptensor_xfloat32 a,
+                                                          hiptensor_xfloat32 b)
+{
+    return hiptensor_xfloat32(float(a) * float(b));
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator/(hiptensor_xfloat32 a,
+                                                          hiptensor_xfloat32 b)
+{
+    return hiptensor_xfloat32(float(a) / float(b));
+}
+inline HIPTENSOR_HOST_DEVICE bool operator<(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return float(a) < float(b);
+}
+inline HIPTENSOR_HOST_DEVICE bool operator==(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return float(a) == float(b);
+}
+inline HIPTENSOR_HOST_DEVICE bool operator>(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return b < a;
+}
+inline HIPTENSOR_HOST_DEVICE bool operator<=(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return !(a > b);
+}
+inline HIPTENSOR_HOST_DEVICE bool operator!=(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return !(a == b);
+}
+inline HIPTENSOR_HOST_DEVICE bool operator>=(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return !(a < b);
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator+=(hiptensor_xfloat32& a,
+                                                            hiptensor_xfloat32  b)
+{
+    return a = a + b;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator-=(hiptensor_xfloat32& a,
+                                                            hiptensor_xfloat32  b)
+{
+    return a = a - b;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator*=(hiptensor_xfloat32& a,
+                                                            hiptensor_xfloat32  b)
+{
+    return a = a * b;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator/=(hiptensor_xfloat32& a,
+                                                            hiptensor_xfloat32  b)
+{
+    return a = a / b;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator++(hiptensor_xfloat32& a)
+{
+    return a += hiptensor_xfloat32(1.0f);
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator--(hiptensor_xfloat32& a)
+{
+    return a -= hiptensor_xfloat32(1.0f);
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator++(hiptensor_xfloat32& a, int)
+{
+    hiptensor_xfloat32 orig = a;
+    ++a;
+    return orig;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator--(hiptensor_xfloat32& a, int)
+{
+    hiptensor_xfloat32 orig = a;
+    --a;
+    return orig;
+}
+
+namespace std
+{
+    constexpr HIPTENSOR_HOST_DEVICE bool isinf(hiptensor_xfloat32 a)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {a.data};
+        return !(~u.int32 & 0x7f800000) && !(u.int32 & 0x7fe000);
+    }
+    constexpr HIPTENSOR_HOST_DEVICE bool isnan(hiptensor_xfloat32 a)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {a.data};
+        return !(~u.int32 & 0x7f800000) && +(u.int32 & 0x7fe000);
+    }
+    constexpr HIPTENSOR_HOST_DEVICE bool iszero(hiptensor_xfloat32 a)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {a.data};
+        return (u.fp32 == 0.0f);
+    }
+
+    HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 sin(hiptensor_xfloat32 a)
+    {
+        return hiptensor_xfloat32(sinf(float(a)));
+    }
+    HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 cos(hiptensor_xfloat32 a)
+    {
+        return hiptensor_xfloat32(cosf(float(a)));
+    }
+
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor_xfloat32 real(const hiptensor_xfloat32& a)
+    {
+        return a;
+    }
+}
+
+#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#endif // HIPTENSOR_XFLOAT32_HPP
diff --git a/library/src/permutation/permutation_ck_impl.hpp b/library/src/permutation/permutation_ck_impl.hpp
index 7878e908..0c73ccff 100644
--- a/library/src/permutation/permutation_ck_impl.hpp
+++ b/library/src/permutation/permutation_ck_impl.hpp
@@ -32,8 +32,8 @@
 #include <ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp>
 #include <ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp>
 
+#include "data_types.hpp"
 #include "performance.hpp"
-#include "types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp
index d64147fe..c1d4a3af 100644
--- a/library/src/permutation/permutation_cpu_reference_impl.hpp
+++ b/library/src/permutation/permutation_cpu_reference_impl.hpp
@@ -29,8 +29,8 @@
 #include <unordered_map>
 #include <vector>
 
+#include "data_types.hpp"
 #include "permutation_cpu_reference.hpp"
-#include "types.hpp"
 #include "util.hpp"
 
 namespace hiptensor
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp
index 4ad97610..5d745d12 100644
--- a/test/01_contraction/contraction_test.cpp
+++ b/test/01_contraction/contraction_test.cpp
@@ -25,7 +25,7 @@
  *******************************************************************************/
 #include <hiptensor/hiptensor.hpp>
 
-#include "types.hpp"
+#include "data_types.hpp"
 #include "llvm/hiptensor_options.hpp"
 
 #include "contraction/contraction_cpu_reference.hpp"
diff --git a/test/02_permutation/permutation_resource.cpp b/test/02_permutation/permutation_resource.cpp
index 4323e01d..1f448ff8 100644
--- a/test/02_permutation/permutation_resource.cpp
+++ b/test/02_permutation/permutation_resource.cpp
@@ -28,7 +28,7 @@
 #define HIPTENSOR_PERMUTATION_RESOURCE_IMPL_HPP
 
 #include "permutation_resource.hpp"
-#include "types.hpp"
+#include "data_types.hpp"
 #include "utils.hpp"
 
 namespace hiptensor
diff --git a/test/02_permutation/permutation_test.cpp b/test/02_permutation/permutation_test.cpp
index dbb52d6e..cfadf5c0 100644
--- a/test/02_permutation/permutation_test.cpp
+++ b/test/02_permutation/permutation_test.cpp
@@ -25,10 +25,10 @@
  *******************************************************************************/
 #include <hiptensor/hiptensor.hpp>
 
+#include "data_types.hpp"
 #include "logger.hpp"
 #include "permutation/permutation_cpu_reference.hpp"
 #include "permutation_test.hpp"
-#include "types.hpp"
 #include "utils.hpp"
 #include "llvm/hiptensor_options.hpp"
 
diff --git a/test/utils.hpp b/test/utils.hpp
index 67418568..1f7ece44 100644
--- a/test/utils.hpp
+++ b/test/utils.hpp
@@ -43,6 +43,7 @@
 #include <hiptensor/internal/hiptensor_utility.hpp>
 
 #include "device/common.hpp"
+#include "types.hpp"
 
 #define HIPTENSOR_FREE_DEVICE(ptr)     \
     if(ptr != nullptr)                 \
@@ -209,19 +210,6 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
     return std::make_pair(retval, max_relative_error);
 }
 
-template <typename DDataType>
-double getEpsilon()
-{
-    if(std::is_same_v<DDataType, _Float16>)
-    {
-        return 0.0009765625; // numeric_limits<_Float16>::epsilon() => 0
-    }
-    else
-    {
-        return std::numeric_limits<DDataType>::epsilon();
-    }
-};
-
 template <typename DDataType>
 std::pair<bool, double> compareEqualLaunchKernel(DDataType*  deviceD,
                                                  DDataType*  hostD,
@@ -288,7 +276,7 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType*  deviceD,
     auto toDouble
         = [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); };
 
-    auto eps = getEpsilon<DDataType>();
+    auto eps = toDouble(std::numeric_limits<DDataType>::epsilon());
     if(isNaN)
     {
         retval           = false;