ROCm · ScXfjiang · Oct 22, 2024 · Oct 21, 2024
diff --git a/tensorflow/compiler/xla/stream_executor/rocm/rocm_dnn.cc b/tensorflow/compiler/xla/stream_executor/rocm/rocm_dnn.cc
@@ -46,6 +46,8 @@ limitations under the License.
 #include "tensorflow/tsl/util/determinism.h"
 #include "tensorflow/tsl/util/env_var.h"
 #include "rocm/rocm_config.h"
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
 
 namespace {
 
@@ -3806,6 +3808,28 @@ bool MIOpenSupport::GetRnnAlgorithms(
   return true;
 }
 
+bool MIOpenSupport::DoBatchNormalizationForward(
+    Stream* stream, const DeviceMemory<Eigen::bfloat16>& x,
+    const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+    const DeviceMemory<float>& estimated_mean,
+    const DeviceMemory<float>& estimated_variance,
+    const DeviceMemory<Eigen::bfloat16>& side_input,
+    const dnn::BatchDescriptor& x_desc,
+    const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+    const double exponential_average_factor,
+    dnn::ActivationMode activation_mode, DeviceMemory<Eigen::bfloat16>* y,
+    DeviceMemory<float>* batch_mean, DeviceMemory<float>* batch_var,
+    DeviceMemory<float>* saved_mean, DeviceMemory<float>* saved_inv_var,
+    bool is_training, ScratchAllocator* reserve_space_allocator,
+    ScratchAllocator* workspace_allocator) {
+  // bfloat16 overload: delegates to the shared Impl; allocators go unused.
+  return DoBatchNormalizationForwardImpl<Eigen::bfloat16, float>(
+      stream, dnn::DataType::kBF16, dnn::DataType::kFloat, x, scale, offset,
+      estimated_mean, estimated_variance, side_input, x_desc, scale_offset_desc,
+      epsilon, exponential_average_factor, activation_mode, y, batch_mean,
+      batch_var, saved_mean, saved_inv_var, is_training);
+}
+
 bool MIOpenSupport::DoBatchNormalizationForward(
     Stream* stream, const DeviceMemory<Eigen::half>& x,
     const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
@@ -3896,6 +3920,26 @@ bool MIOpenSupport::DoBatchNormalizationForwardImpl(
   return true;
 }
 
+bool MIOpenSupport::DoBatchNormalizationBackward(
+    Stream* stream, const DeviceMemory<Eigen::bfloat16>& y_backprop,
+    const DeviceMemory<Eigen::bfloat16>& x, const DeviceMemory<float>& scale,
+    const DeviceMemory<float>& offset, const DeviceMemory<float>& mean,
+    const DeviceMemory<float>& inv_var, const DeviceMemory<Eigen::bfloat16>& y,
+    const dnn::BatchDescriptor& x_desc,
+    const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+    dnn::ActivationMode activation_mode,
+    DeviceMemory<Eigen::bfloat16>* x_backprop,
+    DeviceMemory<float>* scale_backprop, DeviceMemory<float>* offset_backprop,
+    DeviceMemory<Eigen::bfloat16>* side_input_backprop,
+    DeviceMemory<uint8_t>* reserve_space_data,
+    ScratchAllocator* workspace_allocator) {
+  // bfloat16 overload: offset, y and activation_mode are not forwarded.
+  return DoBatchNormalizationBackwardImpl<Eigen::bfloat16, float>(
+      stream, miopenBFloat16, miopenFloat, y_backprop, x, scale, mean, inv_var,
+      x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
+      offset_backprop);
+}
+
 bool MIOpenSupport::DoBatchNormalizationBackward(
     Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
     const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,

diff --git a/tensorflow/compiler/xla/stream_executor/rocm/rocm_dnn.h b/tensorflow/compiler/xla/stream_executor/rocm/rocm_dnn.h
@@ -298,6 +298,21 @@ class MIOpenSupport : public dnn::DnnSupport {
       bool is_training, ScratchAllocator* reserve_space_allocator,
       ScratchAllocator* workspace_allocator) override;
 
+  bool DoBatchNormalizationForward(
+      Stream* stream, const DeviceMemory<Eigen::bfloat16>& x,
+      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+      const DeviceMemory<float>& estimated_mean,
+      const DeviceMemory<float>& estimated_variance,
+      const DeviceMemory<Eigen::bfloat16>& side_input,
+      const dnn::BatchDescriptor& x_desc,
+      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+      const double exponential_average_factor,
+      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::bfloat16>* y,
+      DeviceMemory<float>* batch_mean, DeviceMemory<float>* batch_var,
+      DeviceMemory<float>* saved_mean, DeviceMemory<float>* saved_inv_var,
+      bool is_training, ScratchAllocator* reserve_space_allocator,
+      ScratchAllocator* workspace_allocator) override;  // bfloat16 overload
+
   bool DoBatchNormalizationBackward(
       Stream* stream, const DeviceMemory<float>& y_backprop,
       const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
@@ -325,6 +340,21 @@ class MIOpenSupport : public dnn::DnnSupport {
       DeviceMemory<uint8>* reserve_space_data,
       ScratchAllocator* workspace_allocator) override;
 
+  bool DoBatchNormalizationBackward(
+      Stream* stream, const DeviceMemory<Eigen::bfloat16>& y_backprop,
+      const DeviceMemory<Eigen::bfloat16>& x, const DeviceMemory<float>& scale,
+      const DeviceMemory<float>& offset, const DeviceMemory<float>& mean,
+      const DeviceMemory<float>& inv_var,
+      const DeviceMemory<Eigen::bfloat16>& y,
+      const dnn::BatchDescriptor& x_desc,
+      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+      dnn::ActivationMode activation_mode,
+      DeviceMemory<Eigen::bfloat16>* x_backprop,
+      DeviceMemory<float>* scale_backprop, DeviceMemory<float>* offset_backprop,
+      DeviceMemory<Eigen::bfloat16>* side_input_backprop,
+      DeviceMemory<uint8_t>* reserve_space_data,
+      ScratchAllocator* workspace_allocator) override;  // bfloat16 overload
+
   tsl::Status DoConvolve(
       dnn::ConvolutionKind kind, dnn::DataType element_type,
       dnn::DataType output_type, Stream* stream,

diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <vector>
+
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -29,60 +30,42 @@ limitations under the License.
 
 namespace tensorflow {
 
-class BatchNormOpTest : public OpsTestBase {};
-
-TEST_F(BatchNormOpTest, Simple) {
-  TF_EXPECT_OK(
-      NodeDefBuilder("batch_norm_op", "BatchNormWithGlobalNormalization")
-          .Input(FakeInput(DT_FLOAT))
-          .Input(FakeInput(DT_FLOAT))
-          .Input(FakeInput(DT_FLOAT))
-          .Input(FakeInput(DT_FLOAT))
-          .Input(FakeInput(DT_FLOAT))
-          .Attr("scale_after_normalization", false)
-          .Attr("variance_epsilon", 0.001)
-          .Finalize(node_def()));
-  TF_EXPECT_OK(InitOpWithGraphVersion(8));
-  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
-                           {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6});
-  AddInputFromArray<float>(TensorShape({2}), {10, 20});
-  AddInputFromArray<float>(TensorShape({2}), {0.25f, 0.5f});
-  AddInputFromArray<float>(TensorShape({2}), {0.1f, 0.6f});
-  AddInputFromArray<float>(TensorShape({2}), {0.0f, 0.0f});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
-  test::FillValues<float>(
-      &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f,
-                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
-}
-
-TEST_F(BatchNormOpTest, Fp16) {
-  TF_EXPECT_OK(
-      NodeDefBuilder("batch_norm_op", "BatchNormWithGlobalNormalization")
-          .Input(FakeInput(DT_HALF))
-          .Input(FakeInput(DT_HALF))
-          .Input(FakeInput(DT_HALF))
-          .Input(FakeInput(DT_HALF))
-          .Input(FakeInput(DT_HALF))
-          .Attr("scale_after_normalization", false)
-          .Attr("variance_epsilon", 0.001)
-          .Finalize(node_def()));
-  TF_EXPECT_OK(InitOpWithGraphVersion(8));
-  AddInputFromList<Eigen::half>(TensorShape({1, 1, 6, 2}),
-                                {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6});
-  AddInputFromList<Eigen::half>(TensorShape({2}), {10, 20});
-  AddInputFromList<Eigen::half>(TensorShape({2}), {0.25, 0.5});
-  AddInputFromList<Eigen::half>(TensorShape({2}), {0.1, 0.6});
-  AddInputFromList<Eigen::half>(TensorShape({2}), {0.0, 0.0});
-  TF_ASSERT_OK(RunOpKernel());
+template <typename T>
+struct BatchNormOpTest : public OpsTestBase {  // fixture templated on dtype T
+  static constexpr auto TValueType = DataTypeToEnum<T>::value;  // enum for T
+  void run_me() {  // builds, runs and checks the op for element type T
+    TF_EXPECT_OK(
+        NodeDefBuilder("batch_norm_op", "BatchNormWithGlobalNormalization")
+            .Input(FakeInput(TValueType))
+            .Input(FakeInput(TValueType))
+            .Input(FakeInput(TValueType))
+            .Input(FakeInput(TValueType))
+            .Input(FakeInput(TValueType))
+            .Attr("scale_after_normalization", false)
+            .Attr("variance_epsilon", 0.001)
+            .Finalize(node_def()));
+    TF_EXPECT_OK(InitOpWithGraphVersion(8));
+    AddInputFromList<T>(TensorShape({1, 1, 6, 2}),
+                        {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6});
+    AddInputFromList<T>(TensorShape({2}), {10, 20});
+    AddInputFromList<T>(TensorShape({2}), {0.25, 0.5});
+    AddInputFromList<T>(TensorShape({2}), {0.1, 0.6});
+    AddInputFromList<T>(TensorShape({2}), {0.0, 0.0});
+    TF_ASSERT_OK(RunOpKernel());
+    double atol = TValueType == DT_FLOAT ? 0.01 : 0.1;  // looser if T != float
+    Tensor expected(allocator(), TValueType, TensorShape({1, 1, 6, 2}));
+    test::FillValues<T>(&expected,
+                        {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f,
+                         -21.86f, -33.31f, -23.85f, -34.72f, -25.85f, -36.13f});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), atol);
+  }
+};
 
-  Tensor expected(allocator(), DT_HALF, TensorShape({1, 1, 6, 2}));
-  test::FillValues<Eigen::half>(
-      &expected, {-17.86, -22.00, -15.87, -20.59, -13.87, -19.18, -21.86,
-                  -33.31, -23.85, -34.72, -25.85, -36.13});
-  test::ExpectTensorNear<Eigen::half>(expected, *GetOutput(0), 0.1);
-}
+TYPED_TEST_SUITE_P(BatchNormOpTest);
+TYPED_TEST_P(BatchNormOpTest, Simple) { this->run_me(); }  // logic in fixture
 
+REGISTER_TYPED_TEST_SUITE_P(BatchNormOpTest, Simple);
+// TODO(ezhulenev): Add support for more data types, e.g. Eigen::bfloat16.
+using DataTypes = ::testing::Types<float, Eigen::half>;
+INSTANTIATE_TYPED_TEST_SUITE_P(Test, BatchNormOpTest, DataTypes);
 }  // namespace tensorflow